In [92]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif, SelectKBest
import pandas as pd

from collections import Counter
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss

In [93]:
df=pd.read_csv('raw copy.csv')

In [94]:
df.shape

(4925, 30)

In [95]:
df['disease'].value_counts()

disease
0    3612
1    1313
Name: count, dtype: int64

In [96]:
x,y=train_test_split(df, test_size=0.2)

In [97]:
x['disease'].value_counts()

disease
0    2894
1    1046
Name: count, dtype: int64

In [98]:
y_train=x['disease']
x_train=x.drop('disease',axis=1)

In [99]:
y_test=y['disease']
x_test=y.drop('disease',axis=1)

In [100]:
x_test.shape

(985, 29)

In [101]:
x_train.shape

(3940, 29)

In [102]:
y_train.value_counts()

disease
0    2894
1    1046
Name: count, dtype: int64

In [103]:
def oversample_data(x, y):
    class_weight={0: y.value_counts()[0], 1:int((y.value_counts()[0])*0.7)}
    print(class_weight)
    smote=SMOTE(sampling_strategy=class_weight,k_neighbors=4)
    smote=SMOTETomek(sampling_strategy='not minority', random_state=0,smote=smote)

    ros=RandomOverSampler(sampling_strategy='not majority')
    x,y=smote.fit_resample(x,y)
    
    return x,y

In [104]:
print('Original dataset shape %s' % Counter(y_train))
# x,y=smote.fit_resample(final_dataset.iloc[:,:-1],final_dataset['disease'])

x,y=oversample_data(x_train,y_train)

# x,y=us.fit_resample(final_dataset.iloc[:,:-1],final_dataset['disease'])
print('Resampled dataset shape %s' % Counter(y))

Original dataset shape Counter({0: 2894, 1: 1046})
{0: 2894, 1: 2025}
Resampled dataset shape Counter({0: 2854, 1: 1985})


In [105]:
# x_train=pd.DataFrame(x)
# y_train=pd.DataFrame(y)

In [106]:
def best_feature(x,y):
    best_feature=SelectKBest(mutual_info_classif,k=5)
    best_feature.fit(x,y)
    feature_names=x.columns[best_feature.get_support()]
    return feature_names

In [107]:
feature_names=best_feature(x_train,y_train)

In [108]:
x_train=x_train[feature_names]
y_train=y_train

In [109]:
x_test=x_test[feature_names]
y_test=y_test

In [110]:
x_train

Unnamed: 0,fti,t3,tbg,tsh,tt4
1421,17.0,0.60,25.0,55.000,15.0
1765,101.0,2.40,24.4,1.000,107.0
428,71.0,1.90,25.0,12.542,81.0
2946,77.0,2.50,25.0,2.300,78.0
3614,77.0,2.18,25.0,19.000,93.0
...,...,...,...,...,...
495,101.0,1.70,24.4,0.200,85.0
1784,48.0,1.60,25.0,29.000,47.0
2437,112.0,2.00,24.4,20.960,96.0
1501,152.0,1.60,24.4,0.200,122.0


In [111]:
x_test

Unnamed: 0,fti,t3,tbg,tsh,tt4
4405,103.0,2.30,24.4,1.000000,116.0
2861,4.0,0.50,25.8,500.000000,5.0
4423,122.0,1.30,24.4,2.100000,109.0
3806,130.0,2.30,24.4,2.900000,192.0
1633,90.0,1.38,24.4,9.100000,101.0
...,...,...,...,...,...
2222,101.0,1.90,24.4,14.299999,111.0
4420,124.0,2.06,24.4,0.500000,126.0
2847,64.0,2.18,25.0,49.000000,87.0
168,163.0,2.00,24.4,0.005000,147.0


In [112]:
def transform(X):
    scale=StandardScaler()
    trans_x=pd.DataFrame(scale.fit_transform(X), columns=X.columns)
    return trans_x

In [113]:
x_train=transform(x_train)

In [114]:
x_test=transform(x_test)

In [115]:
x_train.head()

Unnamed: 0,fti,t3,tbg,tsh,tt4
0,-1.921949,-1.475043,1.00103,1.11821,-2.062293
1,-0.185984,0.362295,-0.216961,-0.275671,0.005496
2,-0.805972,-0.148077,1.00103,0.022259,-0.578879
3,-0.681974,0.464369,1.00103,-0.242114,-0.646307
4,-0.681974,0.137731,1.00103,0.188956,-0.309168


In [116]:
x_train.to_csv('temp_train_data.csv',index=False)
y_train.to_csv('temp_output_train_data.csv',index=False)

In [117]:
x_test.to_csv('temp_test_data.csv',index=False)
y_test.to_csv('temp_output_test_data.csv',index=False)