In [45]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import itertools
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap.umap_ import UMAP

In [90]:
dataset_df = pd.read_csv(r"C:\Users\ghibl\ICR\data\input\train.csv")
new_column_names = {col:col.replace(" ","") for col in dataset_df.columns}
dataset_df.rename(columns = new_column_names, inplace = True)
X_df = dataset_df.drop("Id", axis = 1)

In [91]:
X_df.columns[:100]

Index(['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD', 'BN', 'BP',
       'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU',
       'CW', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY', 'EB',
       'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD', 'FE', 'FI', 'FL',
       'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL', 'Class'],
      dtype='object')

Do labelencoding for EJ column

In [92]:
le = LabelEncoder()
X_df["EJ"] = le.fit_transform(X_df["EJ"])

In [93]:
features_df = X_df.copy()

Relation between missing values and class in BQ and EL

In [94]:
miss_df = dataset_df[["BQ","EL","Class"]].copy()
miss_df["bq_na"] = miss_df["BQ"].isnull()
miss_df["el_na"] = miss_df["EL"].isnull()

bq_pivot = miss_df.pivot_table(index = "bq_na",columns = "Class", values = None,aggfunc = 'count')
el_pivot = miss_df.pivot_table(index = "el_na",columns = "Class", values = None,aggfunc = 'count')

print(bq_pivot)
print(el_pivot)

          BQ            EL         el_na       
Class      0      1      0      1      0      1
bq_na                                          
False  449.0  108.0  448.0  102.0  449.0  108.0
True     0.0    NaN    7.0    NaN   60.0    NaN
        BQ        EL      bq_na     
Class    0    1    0    1     0    1
el_na                               
False  448  102  455  102   455  102
True     1    6    0    0    54    6


complement missing values with median or average


In [95]:
null_columns =features_df.columns[features_df.isnull().any()]
null_columns = [i for i in null_columns if not i in ["BQ","EL"]]

for column in null_columns:
    median = features_df[column].median()
    features_df[column] = features_df[column].fillna(median)

# Predict missing values by KNN

BQ

In [96]:
#split the dataset to trainset and testset
bq_df = features_df.copy().drop(["Class","EL"],axis = 1)
null_index = bq_df[bq_df["BQ"].isnull()].index
train_bq = bq_df.drop(null_index,axis = 0)
test_bq = bq_df.iloc[null_index,:]

In [97]:
X_train_bq ,y_train_bq = train_bq.drop("BQ",axis = 1), train_bq["BQ"]
X_test_bq = test_bq.drop("BQ",axis = 1)

knn = KNeighborsRegressor(n_neighbors = 4)
knn.fit(X_train_bq,y_train_bq)
y_pred_bq = knn.predict(X_test_bq)

In [98]:
y_pred_bq = pd.Series(y_pred_bq,index = X_test_bq.index)
features_df["BQ"] = features_df["BQ"].fillna(y_pred_bq)

EL

In [99]:
el_df = features_df.copy().drop(["Class","BQ"],axis = 1)
null_index = el_df[el_df["EL"].isnull()].index
train_el = el_df.drop(null_index,axis = 0)
test_el = el_df.iloc[null_index,:]

In [100]:
X_train_el, y_train_el = train_el.drop("EL",axis = 1), train_el["EL"]
X_test_el = test_el.drop("EL",axis = 1)

knn = KNeighborsRegressor(n_neighbors = 4)
knn.fit(X_train_el,y_train_el)
y_pred_el = knn.predict(X_test_el)

In [101]:
y_pred_el = pd.Series(y_pred_el,index = X_test_el.index)
features_df["EL"] = features_df["EL"].fillna(y_pred_el)

In [102]:
print(features_df.shape)
features_df.isnull().sum()

(617, 57)


AB       0
AF       0
AH       0
AM       0
AR       0
AX       0
AY       0
AZ       0
BC       0
BD       0
BN       0
BP       0
BQ       0
BR       0
BZ       0
CB       0
CC       0
CD       0
CF       0
CH       0
CL       0
CR       0
CS       0
CU       0
CW       0
DA       0
DE       0
DF       0
DH       0
DI       0
DL       0
DN       0
DU       0
DV       0
DY       0
EB       0
EE       0
EG       0
EH       0
EJ       0
EL       0
EP       0
EU       0
FC       0
FD       0
FE       0
FI       0
FL       0
FR       0
FS       0
GB       0
GE       0
GF       0
GH       0
GI       0
GL       0
Class    0
dtype: int64

Save this no missing value dataframe into csv file

In [103]:
complemented_df = pd.concat([dataset_df["Id"],features_df],axis = 1)
complemented_df.to_csv(r'C:\Users\ghibl\ICR\data\input\processed_train.csv',index = False)

In [104]:
complemented_df.head()

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


Standardize basic features

In [105]:
processed_df = features_df.copy()
num_cols = list(processed_df.columns)
num_cols = [i for i in num_cols if not i in ["EJ","Class"]]
sc = StandardScaler()
processed_df[num_cols] = sc.fit_transform(processed_df[num_cols])
processed_df_csv = pd.concat([dataset_df["Id"],processed_df],axis = 1)
processed_df_csv.to_csv(r'C:\Users\ghibl\ICR\data\input\processed_std_train.csv',index = False)

In [106]:
processed_df.drop("Class",axis = 1,inplace = True)

In [107]:
processed_df.isnull().sum()

AB    0
AF    0
AH    0
AM    0
AR    0
AX    0
AY    0
AZ    0
BC    0
BD    0
BN    0
BP    0
BQ    0
BR    0
BZ    0
CB    0
CC    0
CD    0
CF    0
CH    0
CL    0
CR    0
CS    0
CU    0
CW    0
DA    0
DE    0
DF    0
DH    0
DI    0
DL    0
DN    0
DU    0
DV    0
DY    0
EB    0
EE    0
EG    0
EH    0
EJ    0
EL    0
EP    0
EU    0
FC    0
FD    0
FE    0
FI    0
FL    0
FR    0
FS    0
GB    0
GE    0
GF    0
GH    0
GI    0
GL    0
dtype: int64

# PCA

In [108]:
pca = PCA(n_components=5)
pca_df = pd.DataFrame(pca.fit_transform(processed_df),
                      columns = ["pca_Component1","pca_Component2","pca_Component3","pca_Component4","pca_Component5"])

In [109]:
kmeans = KMeans(n_clusters = 2, random_state = 42)
kmeans.fit(pca_df)
features_clusters = kmeans.predict(pca_df)
pca_distances = kmeans.transform(pca_df)
pca_distances_df = pd.DataFrame(pca_distances, columns = ["pca_distance1","pca_distance2"])
pca_df["pca_Cluster"] = features_clusters
pca_df = pd.concat([pca_df,pca_distances_df],axis =1)




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



In [110]:
fig = go.Figure()
fig.add_trace(go.Scatter(x = pca_df["pca_Component1"], y = pca_df["pca_Component2"],mode = "markers",
                        marker = dict(color = pca_df["pca_Cluster"])))

In [111]:
fig = go.Figure()
fig.add_trace(go.Scatter3d(x = pca_df["pca_Component1"], y = pca_df["pca_Component2"],z = pca_df["pca_Component3"],
                         mode = "markers",marker = dict(color = pca_df["pca_Cluster"])))

In [112]:
features_df = pd.concat([features_df,pca_df.drop("pca_Cluster",axis = 1)],axis = 1)

In [113]:
features_df

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,GI,GL,Class,pca_Component1,pca_Component2,pca_Component3,pca_Component4,pca_Component5,pca_distance1,pca_distance2
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,69.834944,0.120343,1,-1.738186,-1.109674,1.198680,0.861387,0.212835,2.496359,41.978149
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,5496.92824,...,32.131996,21.978000,0,-1.441891,-1.360396,-0.898627,-0.314309,0.677060,2.262527,41.708462
2,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,5135.78024,...,35.192676,0.196941,0,0.072948,0.433984,0.720709,1.318471,-1.222513,1.985286,41.057601
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,4169.67738,...,90.493248,0.155829,0,-0.359204,0.262911,0.684023,0.887929,-1.511198,1.909004,41.387953
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,5728.73412,...,36.262628,0.096614,1,-0.524386,0.788124,0.094474,1.568198,0.179713,1.792448,41.716265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,4157.68439,...,69.191944,21.978000,0,-1.296265,-1.154622,-0.530270,-0.681719,0.463395,1.951633,41.655074
613,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,5654.07556,...,124.808872,0.145340,0,1.064404,1.496182,0.800727,-3.428718,0.722264,4.046194,40.886041
614,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,5888.87769,...,119.559420,21.978000,0,0.454714,-0.162248,-0.720338,-3.770568,2.154255,4.441790,40.782523
615,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,4517.86560,...,37.155112,0.184622,0,-0.892914,-0.740525,0.179983,0.248897,0.008872,1.155653,41.399879


# t-SNE

In [114]:
tsne = TSNE(n_components = 2, random_state = 42)
tsne_df = pd.DataFrame(tsne.fit_transform(processed_df),
                      columns = ["tsne_Component1","tsne_Component2"])

In [115]:
kmeans.fit(tsne_df)
features_clusters = kmeans.predict(tsne_df)
tsne_distances = kmeans.transform(tsne_df)
tsne_distances_df = pd.DataFrame(tsne_distances, columns = ["tsne_distance1","tsne_distance2"])
tsne_df["tsne_Cluster"] = features_clusters
tsne_df = pd.concat([tsne_df,tsne_distances_df],axis =1)




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



In [116]:
fig = go.Figure()
fig.add_trace(go.Scatter(x = tsne_df["tsne_Component1"], y = tsne_df["tsne_Component2"],mode = "markers",
                        marker = dict(color = tsne_df["tsne_Cluster"])))

In [117]:
tsne2 = TSNE(n_components = 3, random_state = 42)
tsne_df2 = pd.DataFrame(tsne2.fit_transform(processed_df),
                      columns = ["tsne_Component1","tsne_Component2","tsne_Component3"])

In [118]:
kmeans.fit(tsne_df2)
features_clusters2 = kmeans.predict(tsne_df2)
tsne_df2["tsne_Cluster"] = features_clusters




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



In [119]:
fig = go.Figure()
fig.add_trace(go.Scatter3d(x = tsne_df2["tsne_Component1"], y = tsne_df2["tsne_Component2"],z = tsne_df2["tsne_Component3"],
                         mode = "markers",marker = dict(color = tsne_df2["tsne_Cluster"])))

In [120]:
features_df = pd.concat([features_df,tsne_df.drop("tsne_Cluster", axis = 1)],axis = 1)

In [121]:
features_df.head()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,pca_Component2,pca_Component3,pca_Component4,pca_Component5,pca_distance1,pca_distance2,tsne_Component1,tsne_Component2,tsne_distance1,tsne_distance2
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,-1.109674,1.19868,0.861387,0.212835,2.496359,41.978149,9.34421,-8.913967,12.277658,24.441946
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,-1.360396,-0.898627,-0.314309,0.67706,2.262527,41.708462,-15.278951,-3.763439,25.385233,1.173498
2,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,5135.78024,...,0.433984,0.720709,1.318471,-1.222513,1.985286,41.057601,0.523973,8.934161,10.216133,20.549118
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,4169.67738,...,0.262911,0.684023,0.887929,-1.511198,1.909004,41.387953,-5.519585,20.11762,22.228573,26.583408
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,5728.73412,...,0.788124,0.094474,1.568198,0.179713,1.792448,41.716265,18.15896,-9.161399,15.463756,33.196178


UMAP

In [122]:
um = UMAP(n_components = 2,random_state = 42)
um_df = pd.DataFrame(um.fit_transform(processed_df),
                     columns = ["um_Component1","um_Component2"])

In [123]:
kmeans.fit(um_df)
features_clusters = kmeans.predict(um_df)
um_distances = kmeans.transform(um_df)
um_distances_df = pd.DataFrame(um_distances, columns = ["um_distance1","um_distance2"])
um_df["um_Cluster"] = features_clusters
um_df = pd.concat([um_df,um_distances_df],axis =1)




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



In [124]:
fig = go.Figure()
fig.add_trace(go.Scatter(x = um_df["um_Component1"], y = um_df["um_Component2"],mode = "markers",
                        marker = dict(color = um_df["um_Cluster"])))

In [125]:
um2 = UMAP(n_components = 3,random_state = 42)
um_df2 = pd.DataFrame(um2.fit_transform(processed_df),
                     columns = ["um_Component1","um_Component2","um_Component3"])

In [126]:
kmeans.fit(um_df2)
features_clusters = kmeans.predict(um_df2)
um_distances2 = kmeans.transform(um_df2)
um_distances_df2 = pd.DataFrame(um_distances2, columns = ["um_distance1","um_distance2"])
um_df2["um_Cluster"] = features_clusters
um_df2 = pd.concat([um_df2,um_distances_df2],axis =1)




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



In [127]:
fig = go.Figure()
fig.add_trace(go.Scatter3d(x = um_df2["um_Component1"], y = um_df2["um_Component2"],z = um_df2["um_Component3"],
                         mode = "markers",marker = dict(color = um_df2["um_Cluster"])))

In [128]:
features_df = pd.concat([features_df,um_df.drop("um_Cluster", axis = 1)],axis = 1)

In [129]:
features_df["Class"]

0      1
1      0
2      0
3      0
4      1
      ..
612    0
613    0
614    0
615    0
616    0
Name: Class, Length: 617, dtype: int64

Create columns to store the added and multiplied values for each feature combination

In [130]:
X_df.drop("Class",axis = 1,inplace = True)
columns = list(X_df.columns)
com_columns = list(itertools.combinations(columns, 2))
for com_col in com_columns:
    features_df[f"{com_col[0]}_{com_col[1]}_mul"] = complemented_df[f"{com_col[0]}"] *  complemented_df[f"{com_col[1]}"]
    features_df[f"{com_col[0]}_{com_col[1]}_sum"] = complemented_df[f"{com_col[0]}"] +  complemented_df[f"{com_col[1]}"]


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

In [131]:
features_df.drop("Class",axis = 1,inplace = True)
features_df_csv = pd.concat([dataset_df["Id"],features_df],axis = 1)
features_df_csv = pd.concat([features_df_csv, dataset_df["Class"]],axis = 1)
features_df_csv.to_csv(r'C:\Users\ghibl\ICR\data\input\all_features.csv',index = False)

In [132]:
features_df.columns[:100]

Index(['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD', 'BN', 'BP',
       'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU',
       'CW', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY', 'EB',
       'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD', 'FE', 'FI', 'FL',
       'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL', 'pca_Component1',
       'pca_Component2', 'pca_Component3', 'pca_Component4', 'pca_Component5',
       'pca_distance1', 'pca_distance2', 'tsne_Component1', 'tsne_Component2',
       'tsne_distance1', 'tsne_distance2', 'um_Component1', 'um_Component2',
       'um_distance1', 'um_distance2', 'AB_AF_mul', 'AB_AF_sum', 'AB_AH_mul',
       'AB_AH_sum', 'AB_AM_mul', 'AB_AM_sum', 'AB_AR_mul', 'AB_AR_sum',
       'AB_AX_mul', 'AB_AX_sum', 'AB_AY_mul', 'AB_AY_sum', 'AB_AZ_mul',
       'AB_AZ_sum', 'AB_BC_mul', 'AB_BC_sum', 'AB_BD_mul', 'AB_BD_sum',
       'AB_BN_mul', 'AB_BN_sum', 'AB_BP_mul', 'AB_BP_sum', 'AB_BQ_mul',
       'AB