# Connect To Drive

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Imports

In [28]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler, RobustScaler, PowerTransformer, PolynomialFeatures
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report

# Load Data

In [29]:
data = pd.read_csv('/content/drive/MyDrive/colabdata/train.csv')

In [30]:
data.head()

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


In [31]:
X = data.drop(['Id','Class'],axis=1)

In [32]:
X.head()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,3.58345,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,10.358927,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978
2,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,5135.78024,...,11.626917,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,4169.67738,...,14.852022,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,5728.73412,...,13.666727,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614


In [33]:
X['EJ'].replace(to_replace=['A','B'],value=[0,1],inplace=True)
X = X.astype(float)

In [34]:
y = data.drop(X,axis=1)
y = y.drop(['Id'],axis=1)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X,y,shuffle=True,test_size=0.2,random_state=4)

In [36]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617 entries, 0 to 616
Data columns (total 56 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AB      617 non-null    float64
 1   AF      617 non-null    float64
 2   AH      617 non-null    float64
 3   AM      617 non-null    float64
 4   AR      617 non-null    float64
 5   AX      617 non-null    float64
 6   AY      617 non-null    float64
 7   AZ      617 non-null    float64
 8   BC      617 non-null    float64
 9   BD      617 non-null    float64
 10  BN      617 non-null    float64
 11  BP      617 non-null    float64
 12  BQ      557 non-null    float64
 13  BR      617 non-null    float64
 14  BZ      617 non-null    float64
 15  CB      615 non-null    float64
 16  CC      614 non-null    float64
 17  CD      617 non-null    float64
 18  CF      617 non-null    float64
 19  CH      617 non-null    float64
 20  CL      617 non-null    float64
 21  CR      617 non-null    float64
 22  CS

# Preprocessing

## Imputation

SimpleImputer
KNN Imputer
Iterative Imputer

In [37]:
imputer = SimpleImputer(missing_values=np.nan,strategy='mean')
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

## Scaling


In [38]:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [39]:
# scaler = MaxAbsScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [40]:
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [41]:
# scaler = RobustScaler(quantile_range=(40.0,60.0))
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [42]:
# scaler = PowerTransformer()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [43]:
poly = PolynomialFeatures(degree=2)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

In [44]:
pca = PCA(whiten=False,random_state=56)
pca.fit(X_train)
pca.explained_variance_ratio_ * 100

array([4.52320296e+01, 3.27649350e+01, 1.99566827e+01, 9.15670970e-01,
       8.51844478e-01, 9.31064172e-02, 5.29557848e-02, 3.92022714e-02,
       3.30974088e-02, 2.17160788e-02, 1.05936413e-02, 6.90465594e-03,
       6.13510894e-03, 4.60763961e-03, 2.27139972e-03, 1.58028899e-03,
       1.08934261e-03, 8.66800365e-04, 7.38030994e-04, 6.95585307e-04,
       5.99848297e-04, 4.28983317e-04, 4.17196590e-04, 3.77039607e-04,
       1.97040279e-04, 1.80409271e-04, 1.63124814e-04, 1.13634918e-04,
       1.02673616e-04, 9.08929692e-05, 7.21162841e-05, 6.21026389e-05,
       4.92223779e-05, 4.56033598e-05, 3.73637627e-05, 3.27071381e-05,
       2.79124429e-05, 2.30561227e-05, 2.04127176e-05, 1.98056574e-05,
       1.79006756e-05, 1.73474790e-05, 1.50288972e-05, 1.34604388e-05,
       1.26444608e-05, 1.23628513e-05, 1.15996294e-05, 9.95746797e-06,
       8.89500125e-06, 8.18051936e-06, 7.69642884e-06, 6.88491380e-06,
       6.48446625e-06, 5.39722201e-06, 4.71335720e-06, 4.36072231e-06,
      

# Algorithm

In [45]:
logreg = LogisticRegression(C=0.75,random_state=57,solver='liblinear')
logreg.fit(X_train,y_train)
print(logreg.score(X_train,y_train))
print(logreg.score(X_test,y_test))

  y = column_or_1d(y, warn=True)


0.896551724137931
0.9032258064516129




In [46]:
supvm = SVC(C=0.1,kernel='poly',degree=4,random_state=67)
supvm.fit(X_train,y_train)
print(supvm.score(X_train,y_train))
print(supvm.score(X_test,y_test))

  y = column_or_1d(y, warn=True)


0.8275862068965517
0.8548387096774194


In [47]:
knn = KNeighborsClassifier(n_neighbors=50,p=1)
knn.fit(X_train,y_train)
print(knn.score(X_train,y_train))
print(knn.score(X_test,y_test))

  return self._fit(X, y)


0.8174442190669371
0.8548387096774194


In [48]:
dtree = DecisionTreeClassifier(criterion='entropy',random_state=200,ccp_alpha=1)
dtree.fit(X_train,y_train)
print(dtree.score(X_train,y_train))
print(dtree.score(X_test,y_test))

0.8174442190669371
0.8548387096774194


In [49]:
rfc = RandomForestClassifier(criterion='gini',ccp_alpha=1)
rfc.fit(X_train,y_train)
print(rfc.score(X_train,y_train))
print(rfc.score(X_test,y_test))

  rfc.fit(X_train,y_train)


0.8174442190669371
0.8548387096774194


In [50]:
kmeans = KMeans(n_clusters = 2)
kmeans.fit(X_train,y_train)
print(kmeans.score(X_train,y_train))
print(kmeans.score(X_test,y_test))



-2.1619570933868855e+21
-1.480360295252501e+20


In [51]:
kmeans.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [52]:
print(classification_report(y_test,kmeans.predict(X_test)))

              precision    recall  f1-score   support

           0       0.86      0.98      0.92       106
           1       0.33      0.06      0.10        18

    accuracy                           0.85       124
   macro avg       0.60      0.52      0.51       124
weighted avg       0.78      0.85      0.80       124



# X scaler and Y Algorithm are best performing for this data