### <font color='blue'>**Data Exploration**
    Pulled from Sasya's notebook


In [1]:
import csv
import glob
import random
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import numpy as np
from collections import defaultdict
from pyteomics import  fasta, parser, mass, achrom, electrochem, auxiliary  #, mzxml


In [2]:
peptidoforms_intensity_df = pd.read_csv("data.tsv", sep="\t")
peptidoforms_intensity_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Peptidoform,Peptidoform ID,Unmod peptidoform,Total,Total- Unmodified sequence,Peptidoforms- Unmodified sequence,Proteins,Mass,Charge,Num Mods,...,Patient_M2_healthyMale.Timepoint_2,Patient_M2_healthyMale.Timepoint_2_unmod,Patient_M2_healthyMale.Timepoint_3,Patient_M2_healthyMale.Timepoint_3_unmod,Patient_M3_healthyMale.Timepoint_1,Patient_M3_healthyMale.Timepoint_1_unmod,Patient_M3_healthyMale.Timepoint_2,Patient_M3_healthyMale.Timepoint_2_unmod,Patient_M3_healthyMale.Timepoint_3,Patient_M3_healthyMale.Timepoint_3_unmod
0,.SPLFM+15.995GK.,SPLFM+15.995GK,.SPLFMGK.,11679,11681,2,sp|P01009|A1AT_HUMAN;tr|A0A024R6I7|A0A024R6I7_...,795.406,2,1,...,,84453000.0,,92032000.0,,,,51720000.0,,49329000.0
1,.EPQVYTLPPSREEM+15.995TK.,EPQVYTLPPSREEM+15.995TK,.EPQVYTLPPSREEMTK.,11411,13133,46,sp|P01859|IGHG2_HUMAN;sp|P01860|IGHG3_HUMAN;tr...,1920.9381,2,1,...,,,,,,,,,,
2,.AVM+15.995DDFAAFVEK.,AVM+15.995DDFAAFVEK,.AVMDDFAAFVEK.,10949,18495,81,sp|P02768-2|ALBU_HUMAN;sp|P02768-3|ALBU_HUMAN;...,1358.6318,2,1,...,,1911700000.0,,1625600000.0,,43830000.0,,1484500000.0,,1147100000.0
3,.EFNAETFTFHADIC-33.988TLSEK.,EFNAETFTFHADIC-33.988TLSEK,.EFNAETFTFHADICTLSEK.,10198,22966,174,sp|P02768-2|ALBU_HUMAN;sp|P02768-3|ALBU_HUMAN;...,2169.0257,2,1,...,,,,,,,,,,
4,.M+15.995ADEAGSEADHEGTHSTK.,M+15.995ADEAGSEADHEGTHSTK,.MADEAGSEADHEGTHSTK.,9608,18724,131,sp|P02671-2|FIBA_HUMAN;sp|P02671|FIBA_HUMAN,1888.7728,2,1,...,,22109000.0,,1078800000.0,,6243600.0,,1199000000.0,,644940000.0


#### <font color='purple'>Removing 'unmod' columns (And last few columns that do not adhere to the 'Patient_##.Timepoint_#' format

In [3]:
df = peptidoforms_intensity_df.iloc[:, :667]
pep_reduced_intensity_df = df[df.columns.drop(list(df.filter(regex='.\_unmod')))]

In [4]:
print("Original dimensions -", peptidoforms_intensity_df.shape)
print("Modified dimensions -", pep_reduced_intensity_df.shape)

Original dimensions - (40921, 704)
Modified dimensions - (40921, 350)


---

### <font color='blue'>**Data Processing**

From [link](https://docs.google.com/document/d/19OM5dATKRMVfWUJdXw7VlOGRBuQJO7lAYuaBr2coeLg/edit) (page 7):

First, since different peptides have different ranges of absolute intensity values measured across the various samples, it is recommended that the values of each peptide be normalized to z-scores across all samples where each peptide is observed - this centers the observations of each peptide on its mean value and displays its variation in the context of each peptide’s standard deviation across all samples


In [5]:
def get_tsv_data(path):
    with open(path) as tsv:
        reader = csv.reader(tsv, dialect="excel-tab")
        header = next(reader) #skip header
        for line in reader:
            yield list(zip(header, line))


            
def get_mzxml_filenames(dir_path):
    for mzxml_filepath in glob.glob(f"{dir_path}/*.mzXML"):
        yield mzxml_filepath

In [6]:
class Data:
    def __init__(self, patient, pepti, intensity):
        '''
        patient is a list of patient name corresponding to x-asix in intensity
        pepti is a list of pepti id corresponsing to y-axis in intensity
        intensity is a 2D numpy array what stores intensity data of patient and pepti
        '''
        self.patient = patient
        self.pepti = pepti
        self.intensity = intensity 

    @staticmethod
    def intensity_string_to_int(intensity):
        if intensity == "N/A":
            return 0
        return int(intensity.replace(',', ''))

    @classmethod
    def fromFilePath(cls, path, pepti_id_col = 1, start_col = 32, end_col = 667):
        #count line of file
        num_pepti = 0
        for _ in open(path): 
            num_pepti += 1
        num_pepti -= 1 # remove header

        #open file
        with open(path) as tsv:
            reader = csv.reader(tsv, dialect="excel-tab")
            patient = next(reader)[start_col:end_col:2] #header after 32 is patient name with time stamp
            #added end_col to signify end of cols with patient name in standard format (Patient_##.Timepoint_#)
            #added step=2 to skip all the patients with 'unmod'
            #patient = [ x for x in pat if "Timepoint_1" in x ]
            #indexes = [ (pat.index(x)+start_col) for x in patient ] #Indexes of all the columns with Patient names at Timepoint_1

            pepti = [""] * num_pepti
            intensity_data = np.empty((num_pepti, len(patient)), int)

            for i, line in enumerate(reader):
                pepti[i] = line[ pepti_id_col ]
                intensity_data[i,:] = np.array([ cls.intensity_string_to_int(l) for l in line[start_col:end_col:2]])
            
            '''print("\nPatient - ", patient)
            print("\nPeptides - ", pepti)
            print("\nIntensities - ", intensity_data)'''

        return cls(patient, pepti, intensity_data)

    def get_patient_from_index(self,p_index):
        return self.intensity[:,p_index]

    def get_patient(self, patient):
        p_index = self.patient.index(patient)
        return self.get_patient_from_index(p_index)

    def get_patient_list(self, patient_list):
        re = np.zeros((len(self.pepti), len(patient_list)))
        for i, pat in enumerate(patient_list):
            re[:,i] = self.get_patient(pat)
        return re

    def get_pepti_from_index(self,p_index):
        return self.intensity[p_index,:]

    def get_pepti(self, pepti):
        p_index = self.pepti.index(pepti)
        return self.get_pepti_from_index(p_index)

    def get_pepti_list(self, pepti_list):
        re = np.zeros((len(pepti_list), len(self.patient)),int)
        for i, pep in enumerate(pepti_list):
            re[i,:] = self.get_pepti(pep)
        return re

In [7]:
#loading data
data = Data.fromFilePath("data.tsv")

In [8]:
#Checking to see if all patients have info for all 7 timepoints

from collections import Counter
patient_names = []

for i in range(len(data.patient)):
  patient_names.append(data.patient[i][8:10])

print(Counter(patient_names))

Counter({'01': 7, '02': 7, '03': 7, '05': 7, '06': 7, '07': 7, '08': 7, '09': 7, '10': 7, '11': 7, '12': 7, '14': 7, '15': 7, '16': 7, '17': 7, '19': 7, '20': 7, '21': 7, '22': 7, '23': 7, '25': 7, '26': 7, '27': 7, '28': 7, '29': 7, '30': 7, '34': 7, '36': 7, '37': 7, '38': 7, '39': 7, '40': 7, '41': 7, '42': 7, '43': 7, '45': 7, '46': 7, '47': 7, '48': 7, '52': 7, '57': 7, '58': 7, '44': 6, '24': 2, '31': 2, '33': 2, '35': 2, '49': 2, '50': 2, '51': 2, '53': 2, '54': 2})


#### <font color='purple'>Patients with data less than 7 timepoints -

* '44': 6 
* '24': 2
* '31': 2
* '33': 2
* '35': 2 
* '49': 2 
* '50': 2 
* '51': 2 
* '53': 2 
* '54': 2

---

In [9]:
#showing what data contains
print(data.intensity.shape)

print(len(data.patient))
print(data.patient)

print(len(data.pepti))
print(data.pepti[:5])

(40921, 318)
318
['Patient_01.Timepoint_1', 'Patient_01.Timepoint_2', 'Patient_01.Timepoint_3', 'Patient_01.Timepoint_4', 'Patient_01.Timepoint_5', 'Patient_01.Timepoint_6', 'Patient_01.Timepoint_7', 'Patient_02.Timepoint_1', 'Patient_02.Timepoint_2', 'Patient_02.Timepoint_3', 'Patient_02.Timepoint_4', 'Patient_02.Timepoint_5', 'Patient_02.Timepoint_6', 'Patient_02.Timepoint_7', 'Patient_03.Timepoint_1', 'Patient_03.Timepoint_2', 'Patient_03.Timepoint_3', 'Patient_03.Timepoint_4', 'Patient_03.Timepoint_5', 'Patient_03.Timepoint_6', 'Patient_03.Timepoint_7', 'Patient_05.Timepoint_1', 'Patient_05.Timepoint_2', 'Patient_05.Timepoint_3', 'Patient_05.Timepoint_4', 'Patient_05.Timepoint_5', 'Patient_05.Timepoint_6', 'Patient_05.Timepoint_7', 'Patient_06.Timepoint_1', 'Patient_06.Timepoint_2', 'Patient_06.Timepoint_3', 'Patient_06.Timepoint_4', 'Patient_06.Timepoint_5', 'Patient_06.Timepoint_6', 'Patient_06.Timepoint_7', 'Patient_07.Timepoint_1', 'Patient_07.Timepoint_2', 'Patient_07.Timepoin

In [10]:
#testing some get function
print(len(data.get_patient('Patient_01.Timepoint_1')))
print(len(data.get_pepti('SPLFM+15.995GK')))

temp_patient = data.get_patient_list(['Patient_01.Timepoint_1', 'Patient_02.Timepoint_1', 'Patient_03.Timepoint_1'])
print(temp_patient.shape)

temp_papti = data.get_pepti_list(['SPLFM+15.995GK', 'AVM+15.995DDFAAFVEK', 'M+15.995ADEAGSEADHEGTHSTK'])
print(temp_papti.shape)

40921
318
(40921, 3)
(3, 318)


52 total patients resulting in 318 total timepoints. 40921 unique peptides

#### <font color='purple'> Patients without Timepoint_1 info -

04, 13, 18, 32, 44, 55, 56

### <font color='blue'>**Exploratory Analysis**

In [82]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from tqdm import tqdm

patient_list = data.patient
peptide_list = data.pepti

intensity_in_patients = data.get_pepti_list(peptide_list)

pepDict = defaultdict(list)
pepHigh = defaultdict(list)
pepAvg = defaultdict(list)
for i in tqdm(range(len(peptide_list))):
    pepDict[peptide_list[i]] = np.count_nonzero(intensity_in_patients[i])
    pepHigh[peptide_list[i]] = max(intensity_in_patients[i])
    pepAvg[peptide_list[i]] = np.mean(intensity_in_patients[i])

100%|██████████| 40921/40921 [00:01<00:00, 30257.86it/s]


In [89]:
pCounts = [(pepDict[w], w) for w in pepDict]
pCounts.sort()

pHigh = [(pepHigh[w], w) for w in pepHigh]
pHigh.sort()
pHigh.reverse()

pAvg = [(pepAvg[w], w) for w in pepAvg]
pAvg.sort()
pAvg.reverse()

In [90]:
pCounts[20800:20861]

[(0, 'YVMLPVADQYDC+40.00ITHYE'),
 (0, 'YVMLPVADQYDC+464.35ITHYE'),
 (0, 'YVMLPVADQYDC+54.09ITHYEGSTC+57.021PK'),
 (0, 'YVMLPVADQYDC+54.10ITHYEGSTC+57.021PK'),
 (0, 'YVMLPVADQYDC+57.021ITHYEGSTC+166.09PKWKAPK'),
 (0, 'YVMLPVADQYDC+57.021ITHYEGSTC+30.05PKWK'),
 (0, 'YVMLPVADQYDC+57.021ITHYEGSTC+317.10PKWKAPK'),
 (0, 'YVMLPVADQYDC+57.021ITHYEGSTC+344.21PK'),
 (0, 'YVMLPVADQYDC+57.021ITHYEGSTC+54.00PK'),
 (0, 'YVMLPVADQYDC+57.021ITHYEGSTC+54.06PK'),
 (0, 'YVMLPVADQYDC+57.021ITHYEGSTC+54.09PK'),
 (0, 'YVMLPVADQYDC+57.021ITHYEGSTC+54.10PK'),
 (0, 'YVMLPVADQYDC+57.021ITHYEGSTC+54.12PK'),
 (0, 'YVMLPVADQYDC+57.021ITHYEGSTC+57.021PKW-55.039KAPK'),
 (0, 'YVMLPVADQYDC+57.021ITHYEGSTC-40.96PKWKA'),
 (0, 'YVMLPVADQYDC+57.021ITHYEGSTC-40.99PKWKA'),
 (0, 'YVMLPVADQYDC+57.021ITHYEGSTC-59.02PK'),
 (0, 'YVMLPVADQYDC+57.021ITHYEGSTC-59.03PK'),
 (0, 'YVMLPVADQYDC+57.021ITHYEGSTC-59.06PK'),
 (0, 'YVMLPVADQYDC-40.95ITHYEGSTC+57.021PKWKA'),
 (0, 'YVMLPVADQYDC-40.99ITHYEGSTC+57.021PKWKA'),
 (0, 'YVMLPVADQYDC-

In [91]:
pCounts.reverse()
pCounts[:620]

[(318, 'YYWGGQYTWDM+15.995AK'),
 (318, 'YYC-33.988FQGNQFLR'),
 (318, 'YVTSAPM+15.995PEPQAPGR'),
 (318, 'YVGGQEHFAH+57.021LLILR'),
 (318, 'YLQEIYN+0.984SNNQK'),
 (318, 'YIFHNFM+15.995ER'),
 (318, 'YIC-33.988ENQDSISSK'),
 (318, 'YIC+47.985ENQDSISSK'),
 (318, 'YIC+305.068ENQDSISSK'),
 (318, 'YIC+24.995ENQDSISSK'),
 (318, 'YIC+119.004ENQDSISSK'),
 (318, 'YFKPGMPFDLM+15.995VFVTNPDGSPAYR'),
 (318, 'YFKPGM+15.995PFDLMVFVTNPDGSPAYR'),
 (318, 'YEASILTHDSSIRY-34.968'),
 (318, 'YEASILTH+57.021DSSIR'),
 (318, 'YDVENC-33.988LANK'),
 (318, 'YAASSYLSLTPEQW+3.995K'),
 (318, 'WYVDGVEVHN+0.984AK'),
 (318, 'WQQGNVFSC-33.988SVMHEALHNHYTQK'),
 (318, 'WQEEM+15.995ELYR'),
 (318, 'WC-33.988AVSEHEATK'),
 (318, 'WC-33.988ALSHHER'),
 (318, 'W+3.995LQGSQELPR'),
 (318, 'VYAC-33.988EVTHQGLSSPVTK'),
 (318, 'VYAC+57.021E+31.990VTHQGLSSPVTK'),
 (318, 'VYAC+305.068EVTHQGLSSPVTK'),
 (318, 'VYAC+119.004EVTHQGLSSPVTK'),
 (318, 'VVSVLTVVHQDWLN-17.027GK'),
 (318, 'VVSVLTVVHQDWLN+0.984GK'),
 (318, 'VVSVLTVVHQDW+31.990LNGK'),

In [95]:
pHigh[:20]

[(22547750000, 'DVFLGM+15.995FLYEYAR'),
 (21307500000, '+43.006C+58.005TESLVNR'),
 (20906000000, 'VVSVLTVLHQDW+17.991LN-16.990GK'),
 (20126000000, 'AEFAEVSK+42.047'),
 (19390000000, 'VFDEFKPLVEEPQ+0.984NLIK'),
 (19230000000, 'LVN+0.984EVTEFAK'),
 (19047000000, 'VVSVLTVLHQ+19.010D-18.011WLNGK'),
 (18864000000, 'ETEGLRQEM+16.028SK'),
 (18381625000, 'ETEGLRQEM+15.995SK'),
 (17832225000, 'VVSVLTVLHQDWLN+0.984GK'),
 (17670000000, 'C+31.990C+57.021E+37.956KPLLEK'),
 (17577500000, '+27.995C+57.021T+15.995ESLVNR'),
 (17241500000, 'VFSN+0.984GADLSGVTEEAPLK'),
 (16977500000, 'DTLM+15.995ISR'),
 (15871000000, 'QTALVELVKH-8.964'),
 (15599000000, 'VVSVLTVLHQ+0.984DWLNGK'),
 (15209650000, '-17.027QGLLPVLESFK'),
 (15143000000, 'DVFLGM+16.028FLYEYAR'),
 (14992250000, 'LSPLGEEM+15.995RDR'),
 (14403373333, 'VFDEFKPLVEEPQN+0.984LIK')]

In [96]:
pAvg[:20]

[(10221774482.704403, 'DTLM+15.995ISR'),
 (8561755845.544025, 'VVSVLTVLHQDWLN+0.984GK'),
 (8104795220.91195, 'ETEGLRQEM+15.995SK'),
 (7286033878.144654, 'AVM+15.995DDFAAFVEK'),
 (6589125526.773585, 'AEFAEVSK+42.047'),
 (6461229646.226415, '-17.027QHLPLIK'),
 (6459944716.9811325, 'VFSN+0.984GADLSGVTEEAPLK'),
 (5980347654.795597, 'LSPLGEEM+15.995RDR'),
 (5976378734.27673, 'RPC-33.988FSALEVDETYVPK'),
 (5767392362.704403, 'DVFLGM+15.995FLYEYAR'),
 (5579700956.053459, '-17.027QGLLPVLESFK'),
 (5396870665.880503, '-17.027QTALVELVK'),
 (5329950116.798742, '+12.000C+57.021TESLVNR'),
 (5244984847.562893, 'YIC-33.988ENQDSISSK'),
 (5140524043.342768, '+43.006C+58.005TESLVNR'),
 (4933536894.654088, 'EFNAETFTFHADIC-33.988TLSEK'),
 (4906313474.056603, 'SM+15.995GGKEDLIWELLNQAQEHFGK'),
 (4382400487.163522, 'LVN+0.984EVTEFAK'),
 (4297733938.679245, 'FQN+0.984ALLVR'),
 (4276837507.075472, 'KWQEEM+15.995ELYR')]

There are 20860 peptides that do not appear in any patients and 620 peptides that appear in all patients.

---

### <font color='blue'>**Creating Train/Test Datasets**

In [13]:
class PatientDiffLoader(Dataset):
    # UNTESTED !!!
    def __init__(self, path):
        self.data = Data.fromFilePath(path)
    def __len__(self):
        return len(self.data.patient)

    def __getitem__(self, idx, mode="same_patient"):
        diff = bool(random.getrandbits(1))
        
        #print(self.data.patient[idx])
        pid = int(self.data.patient[idx][8:10])
        tp = int(self.data.patient[idx][21])

        label = None
        data_1 = self.data.get_patient_from_index(idx)
        data_2 = None
        if diff:
            #different class
            label = 0.0

            #get a differnt pid
            diff_pid = -1
            while True:
                diff_pid = random.randint(1,58) #there is 58 patients? TODO: make sure it's 58
                if diff_pid != pid:
                    break

            #get diff data
            random_diff = f"Patient_{diff_pid:02d}.Timepoint_{random.randint(1,7)}"
            data_2 = self.data.get_patient( random_diff )
            
        else:
            #same class
            label = 1.0
            random_same = f"Patient_{pid:02d}.Timepoint_{random.randint(1,7)}"
            data_2 = self.data.get_patient( random_same )

        return data_1, data_2, torch.from_numpy(np.array([label], dtype=np.float32))

In [14]:
full_dataset = PatientDiffLoader("./drive/Shareddrives/CSE 291C/Data/peptidoforms_intensity/data.tsv")

In [15]:
full_dataset.__len__()

318

In [16]:
X = []
Y = []

for i in range(full_dataset.__len__()):
  try:
    data_1, data_2, label = full_dataset.__getitem__(i)
    X.append(data_1 + data_2)
    Y.append(label)
  except Exception as e:
    print(e)
    i -= 1

'Patient_04.Timepoint_6' is not in list
'Patient_18.Timepoint_4' is not in list
'Patient_55.Timepoint_3' is not in list
'Patient_51.Timepoint_6' is not in list
'Patient_32.Timepoint_1' is not in list
'Patient_55.Timepoint_7' is not in list
'Patient_04.Timepoint_5' is not in list
'Patient_50.Timepoint_3' is not in list
'Patient_32.Timepoint_1' is not in list
'Patient_53.Timepoint_3' is not in list
'Patient_18.Timepoint_3' is not in list
'Patient_24.Timepoint_7' is not in list
'Patient_55.Timepoint_6' is not in list
'Patient_24.Timepoint_5' is not in list
'Patient_49.Timepoint_6' is not in list
'Patient_56.Timepoint_5' is not in list
'Patient_13.Timepoint_4' is not in list
'Patient_18.Timepoint_5' is not in list
'Patient_31.Timepoint_3' is not in list
'Patient_54.Timepoint_3' is not in list
'Patient_56.Timepoint_4' is not in list
'Patient_31.Timepoint_6' is not in list
'Patient_50.Timepoint_7' is not in list
'Patient_33.Timepoint_4' is not in list
'Patient_33.Timepoint_4' is not in list


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [18]:
len(X_train)

216

In [19]:
X = X_train
Y = y_train
X_V = X_test
Y_V = y_test

---

### <font color='blue'>**Logistic Regression (Rohan)**

In [20]:
import sklearn
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', max_iter=100000)
model.fit(X, Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# helper method to print basic model metrics
def metrics(y_true, y_pred):
    print('Confusion matrix:\n', confusion_matrix(y_true, y_pred))
    print('\nReport:\n', classification_report(y_true, y_pred))

In [22]:
logreg_acc = []

y_pred = model.predict(X_V) 
metrics(Y_V, y_pred) # finally evaluate performance
logreg_acc.append(sklearn.metrics.accuracy_score(Y_V, y_pred))
#logreg_acc.append(model.score(Y_V, y_pred))

Confusion matrix:
 [[12 16]
 [14 12]]

Report:
               precision    recall  f1-score   support

         0.0       0.46      0.43      0.44        28
         1.0       0.43      0.46      0.44        26

    accuracy                           0.44        54
   macro avg       0.45      0.45      0.44        54
weighted avg       0.45      0.44      0.44        54



---

In [23]:
print(len(X))
print(len(Y))
print(len(X_V))
print(len(Y_V))

216
216
54
54


In [24]:
from numpy import array

a = array(X)
print(a.shape)

(216, 40921)


---

### <font color='blue'>**PCA + Logistic Regression**

##### **99% Information**

In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

scaler = StandardScaler()
# Fit on training set only.
scaler.fit(X)

# Apply transform to both the training set and the test set.
train_data = scaler.transform(X)
print("Initial Dimension -", train_data.shape)
test_data = scaler.transform(X_V)

# Make an instance of the Model
pca = PCA(.99)
pca.fit(train_data)

train_data = pca.transform(train_data)
print("Transformed Dimension -", train_data.shape, "\n")
test_data = pca.transform(test_data)

logisticRegr = LogisticRegression(solver = 'lbfgs', max_iter=100000)
logisticRegr.fit(train_data, Y)

metrics(Y_V, logisticRegr.predict(test_data))
logreg_acc.append(logisticRegr.score(test_data, Y_V))
print("\nAccuracy - ", logisticRegr.score(test_data, Y_V))

Initial Dimension - (216, 40921)
Transformed Dimension - (216, 181) 

Confusion matrix:
 [[12 16]
 [11 15]]

Report:
               precision    recall  f1-score   support

         0.0       0.52      0.43      0.47        28
         1.0       0.48      0.58      0.53        26

    accuracy                           0.50        54
   macro avg       0.50      0.50      0.50        54
weighted avg       0.50      0.50      0.50        54


Accuracy -  0.5


##### **95% Information**

In [26]:
# Apply transform to both the training set and the test set.
train_data = scaler.transform(X)
print("Initial Dimension -", train_data.shape)
test_data = scaler.transform(X_V)

# Make an instance of the Model
pca = PCA(.95)
pca.fit(train_data)

train_data = pca.transform(train_data)
print("Transformed Dimension -", train_data.shape, "\n")
test_data = pca.transform(test_data)

logisticRegr = LogisticRegression(solver = 'lbfgs', max_iter=100000)
logisticRegr.fit(train_data, Y)

metrics(Y_V, logisticRegr.predict(test_data))
logreg_acc.append(logisticRegr.score(test_data, Y_V))
print("\nAccuracy - ", logisticRegr.score(test_data, Y_V))

Initial Dimension - (216, 40921)
Transformed Dimension - (216, 144) 

Confusion matrix:
 [[14 14]
 [13 13]]

Report:
               precision    recall  f1-score   support

         0.0       0.52      0.50      0.51        28
         1.0       0.48      0.50      0.49        26

    accuracy                           0.50        54
   macro avg       0.50      0.50      0.50        54
weighted avg       0.50      0.50      0.50        54


Accuracy -  0.5


##### **94% Information** (Just playing around)

In [27]:
# Apply transform to both the training set and the test set.
train_data = scaler.transform(X)
print("Initial Dimension -", train_data.shape)
test_data = scaler.transform(X_V)

# Make an instance of the Model
pca = PCA(.94)
pca.fit(train_data)

train_data = pca.transform(train_data)
print("Transformed Dimension -", train_data.shape, "\n")
test_data = pca.transform(test_data)

logisticRegr = LogisticRegression(solver = 'lbfgs', max_iter=100000)
logisticRegr.fit(train_data, Y)

metrics(Y_V, logisticRegr.predict(test_data))
print("\nAccuracy - ", logisticRegr.score(test_data, Y_V))

Initial Dimension - (216, 40921)
Transformed Dimension - (216, 137) 

Confusion matrix:
 [[12 16]
 [ 9 17]]

Report:
               precision    recall  f1-score   support

         0.0       0.57      0.43      0.49        28
         1.0       0.52      0.65      0.58        26

    accuracy                           0.54        54
   macro avg       0.54      0.54      0.53        54
weighted avg       0.54      0.54      0.53        54


Accuracy -  0.5370370370370371


---

### <font color='blue'>**PCA + SGD Classifier**

##### **Before PCA**

In [28]:
SGD_acc = []

from sklearn.linear_model import SGDClassifier

# Apply transform to both the training set and the test set.
train_data = scaler.transform(X)
test_data = scaler.transform(X_V)

clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=100000)
clf.fit(train_data, Y)

metrics(Y_V, clf.predict(test_data))
SGD_acc.append(clf.score(test_data, Y_V))
print("\nAccuracy - ", clf.score(test_data, Y_V))

Confusion matrix:
 [[15 13]
 [11 15]]

Report:
               precision    recall  f1-score   support

         0.0       0.58      0.54      0.56        28
         1.0       0.54      0.58      0.56        26

    accuracy                           0.56        54
   macro avg       0.56      0.56      0.56        54
weighted avg       0.56      0.56      0.56        54


Accuracy -  0.5555555555555556


##### **99% Information**

In [29]:
# Apply transform to both the training set and the test set.
train_data = scaler.transform(X)
print("Initial Dimension -", train_data.shape)
test_data = scaler.transform(X_V)

# Make an instance of the Model
pca = PCA(.99)
pca.fit(train_data)

train_data = pca.transform(train_data)
print("Transformed Dimension -", train_data.shape, "\n")
test_data = pca.transform(test_data)

clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=100000)
clf.fit(train_data, Y)

metrics(Y_V, clf.predict(test_data))
SGD_acc.append(clf.score(test_data, Y_V))
print("\nAccuracy - ", clf.score(test_data, Y_V))

Initial Dimension - (216, 40921)
Transformed Dimension - (216, 181) 

Confusion matrix:
 [[12 16]
 [12 14]]

Report:
               precision    recall  f1-score   support

         0.0       0.50      0.43      0.46        28
         1.0       0.47      0.54      0.50        26

    accuracy                           0.48        54
   macro avg       0.48      0.48      0.48        54
weighted avg       0.48      0.48      0.48        54


Accuracy -  0.48148148148148145


##### **95% Information**

In [30]:
# Apply transform to both the training set and the test set.
train_data = scaler.transform(X)
print("Initial Dimension -", train_data.shape)
test_data = scaler.transform(X_V)

# Make an instance of the Model
pca = PCA(.95)
pca.fit(train_data)

train_data = pca.transform(train_data)
print("Transformed Dimension -", train_data.shape, "\n")
test_data = pca.transform(test_data)

clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=100000)
clf.fit(train_data, Y)

metrics(Y_V, clf.predict(test_data))
SGD_acc.append(clf.score(test_data, Y_V))
print("\nAccuracy - ", clf.score(test_data, Y_V))

Initial Dimension - (216, 40921)
Transformed Dimension - (216, 144) 

Confusion matrix:
 [[13 15]
 [12 14]]

Report:
               precision    recall  f1-score   support

         0.0       0.52      0.46      0.49        28
         1.0       0.48      0.54      0.51        26

    accuracy                           0.50        54
   macro avg       0.50      0.50      0.50        54
weighted avg       0.50      0.50      0.50        54


Accuracy -  0.5


---

### <font color='blue'>**PCA + XGBoost Classifier**

##### **Before PCA**

In [31]:
XGB_acc = []

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Apply transform to both the training set and the test set.
train_data = scaler.transform(X)
test_data = scaler.transform(X_V)

model = XGBClassifier()
model.fit(train_data, Y)

y_pred = model.predict(test_data)
predictions = [round(value) for value in y_pred]

metrics(Y_V, model.predict(test_data))
accuracy = accuracy_score(Y_V, predictions)
XGB_acc.append(accuracy)

print("Accuracy - ", accuracy)

Confusion matrix:
 [[23  5]
 [ 2 24]]

Report:
               precision    recall  f1-score   support

         0.0       0.92      0.82      0.87        28
         1.0       0.83      0.92      0.87        26

    accuracy                           0.87        54
   macro avg       0.87      0.87      0.87        54
weighted avg       0.88      0.87      0.87        54

Accuracy -  0.8703703703703703


##### **99% Information**

In [32]:
# Apply transform to both the training set and the test set.
train_data = scaler.transform(X)
print("Initial Dimension -", train_data.shape)
test_data = scaler.transform(X_V)

pca = PCA(.99)
pca.fit(train_data)

train_data = pca.transform(train_data)
print("Transformed Dimension -", train_data.shape, "\n")
test_data = pca.transform(test_data)

model = XGBClassifier()
model.fit(train_data, Y)

y_pred = model.predict(test_data)
predictions = [round(value) for value in y_pred]

accuracy = accuracy_score(Y_V, predictions)

metrics(Y_V, model.predict(test_data))
XGB_acc.append(accuracy)
print("\nAccuracy - ", accuracy)

Initial Dimension - (216, 40921)
Transformed Dimension - (216, 181) 

Confusion matrix:
 [[13 15]
 [ 9 17]]

Report:
               precision    recall  f1-score   support

         0.0       0.59      0.46      0.52        28
         1.0       0.53      0.65      0.59        26

    accuracy                           0.56        54
   macro avg       0.56      0.56      0.55        54
weighted avg       0.56      0.56      0.55        54


Accuracy -  0.5555555555555556


##### **95% Information**

In [33]:
# Apply transform to both the training set and the test set.
train_data = scaler.transform(X)
print("Initial Dimension -", train_data.shape)
test_data = scaler.transform(X_V)

pca = PCA(.95)
pca.fit(train_data)

train_data = pca.transform(train_data)
print("Transformed Dimension -", train_data.shape, "\n")
test_data = pca.transform(test_data)

model = XGBClassifier()
model.fit(train_data, Y)

y_pred = model.predict(test_data)
predictions = [round(value) for value in y_pred]

metrics(Y_V, model.predict(test_data))
accuracy = accuracy_score(Y_V, predictions)

XGB_acc.append(accuracy)
print("\nAccuracy - ", accuracy)

Initial Dimension - (216, 40921)
Transformed Dimension - (216, 144) 

Confusion matrix:
 [[14 14]
 [10 16]]

Report:
               precision    recall  f1-score   support

         0.0       0.58      0.50      0.54        28
         1.0       0.53      0.62      0.57        26

    accuracy                           0.56        54
   macro avg       0.56      0.56      0.55        54
weighted avg       0.56      0.56      0.55        54


Accuracy -  0.5555555555555556


---

### <font color='blue'>**PCA + Random Forest Classifier**

##### **Before PCA**

In [34]:
RF_acc = []

from sklearn.ensemble import RandomForestClassifier

# Apply transform to both the training set and the test set.
train_data = scaler.transform(X)
test_data = scaler.transform(X_V)

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(train_data, Y)

metrics(Y_V, clf.predict(test_data))
RF_acc.append(clf.score(test_data, Y_V))
print("\nAccuracy - ", clf.score(test_data, Y_V))

Confusion matrix:
 [[19  9]
 [ 1 25]]

Report:
               precision    recall  f1-score   support

         0.0       0.95      0.68      0.79        28
         1.0       0.74      0.96      0.83        26

    accuracy                           0.81        54
   macro avg       0.84      0.82      0.81        54
weighted avg       0.85      0.81      0.81        54


Accuracy -  0.8148148148148148


##### **99% Information**

In [35]:
# Apply transform to both the training set and the test set.
train_data = scaler.transform(X)
print("Initial Dimension -", train_data.shape)
test_data = scaler.transform(X_V)

pca = PCA(.99)
pca.fit(train_data)

train_data = pca.transform(train_data)
print("Transformed Dimension -", train_data.shape, "\n")
test_data = pca.transform(test_data)

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(train_data, Y)

metrics(Y_V, clf.predict(test_data))
RF_acc.append(clf.score(test_data, Y_V))
print("\nAccuracy - ", clf.score(test_data, Y_V))

Initial Dimension - (216, 40921)
Transformed Dimension - (216, 181) 

Confusion matrix:
 [[ 1 27]
 [ 2 24]]

Report:
               precision    recall  f1-score   support

         0.0       0.33      0.04      0.06        28
         1.0       0.47      0.92      0.62        26

    accuracy                           0.46        54
   macro avg       0.40      0.48      0.34        54
weighted avg       0.40      0.46      0.33        54


Accuracy -  0.46296296296296297


##### **95% Information**

In [36]:
# Apply transform to both the training set and the test set.
train_data = scaler.transform(X)
print("Initial Dimension -", train_data.shape)
test_data = scaler.transform(X_V)

pca = PCA(.95)
pca.fit(train_data)

train_data = pca.transform(train_data)
print("Transformed Dimension -", train_data.shape, "\n")
test_data = pca.transform(test_data)

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(train_data, Y)

metrics(Y_V, clf.predict(test_data))
RF_acc.append(clf.score(test_data, Y_V))
print("\nAccuracy - ", clf.score(test_data, Y_V))

Initial Dimension - (216, 40921)
Transformed Dimension - (216, 144) 

Confusion matrix:
 [[ 2 26]
 [ 2 24]]

Report:
               precision    recall  f1-score   support

         0.0       0.50      0.07      0.12        28
         1.0       0.48      0.92      0.63        26

    accuracy                           0.48        54
   macro avg       0.49      0.50      0.38        54
weighted avg       0.49      0.48      0.37        54


Accuracy -  0.48148148148148145


---

### <font color='blue'>**Accuracies**

In [37]:
acc = pd.DataFrame({'Model': ['Without PCA', 'PCA (99)', 'PCA (95)'],
                   'Log. Reg.': logreg_acc,
                   'SGD': SGD_acc,
                   'XGBoost': XGB_acc,
                   'Ran. Forest': RF_acc})

In [38]:
acc.to_csv('./drive/Shareddrives/CSE 291C/ML_accuracies_new.csv')

In [39]:
acc

Unnamed: 0,Model,Log. Reg.,SGD,XGBoost,Ran. Forest
0,Without PCA,0.444444,0.555556,0.87037,0.814815
1,PCA (99),0.5,0.481481,0.555556,0.462963
2,PCA (95),0.5,0.5,0.555556,0.481481


In [42]:
import pandas as pd

acc_df = pd.read_csv('./drive/Shareddrives/CSE 291C/ML_accuracies_new.csv')

In [43]:
acc_df

Unnamed: 0.1,Unnamed: 0,Model,Log. Reg.,SGD,XGBoost,Ran. Forest
0,0,Without PCA,0.444444,0.555556,0.87037,0.814815
1,1,PCA (99),0.5,0.481481,0.555556,0.462963
2,2,PCA (95),0.5,0.5,0.555556,0.481481
