In [61]:
import pandas as  pd
import matplotlib as mpl
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from imblearn.over_sampling import SMOTE
import pickle 
  


In [62]:

def objectsToNumbers(columns,df):
    """Convert text columns holding decimal-comma numbers to floats, in place.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns of ``df`` to convert.
    df : pandas.DataFrame
        Frame to modify; each listed column is replaced by its converted values.

    Raises
    ------
    ValueError
        If a string cell cannot be parsed as a number after replacing ``,`` by ``.``.
    """
    def convert(value):
        # Only text needs the decimal-comma fix-up.
        if isinstance(value, str):
            return float(value.replace(',', '.'))
        # A missing cell may arrive as None rather than NaN.
        if value is None:
            return float('nan')
        # Already numeric (Python or NumPy float/int, NaN): just normalise to float.
        # The previous `type(value) is float` test sent ints and NumPy scalars to
        # `.replace`, which raised AttributeError.
        return float(value)

    for col in columns:
        df[col] = df[col].apply(convert)
        

In [63]:
def imputeWithMode(columns,df):
    """Fill missing values of each listed column with that column's mode, in place.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns of ``df`` to impute.
    df : pandas.DataFrame
        Frame to modify. The statistic is computed from ``df`` itself.

    Notes
    -----
    When several values tie for the mode, the first one returned by
    ``Series.mode()`` is used. A column with no non-missing value has no mode and
    is left untouched.
    """
    for col in columns:
        modes = df[col].mode()
        if modes.empty:
            # Nothing to impute with: the column is entirely missing.
            continue
        # Assign back instead of `df[col].fillna(..., inplace=True)`: that form is
        # chained assignment on a temporary and does not update `df` when pandas
        # Copy-on-Write is active.
        df[col] = df[col].fillna(modes.iloc[0])
    
def imputeWithMean(columns,df):
    """Fill missing values of each listed numeric column with that column's mean, in place.

    Parameters
    ----------
    columns : iterable of str
        Names of the numeric columns of ``df`` to impute.
    df : pandas.DataFrame
        Frame to modify. The statistic is computed from ``df`` itself.
    """
    for col in columns:
        # Assign back instead of `df[col].fillna(..., inplace=True)`: that form is
        # chained assignment on a temporary and does not update `df` when pandas
        # Copy-on-Write is active.
        df[col] = df[col].fillna(df[col].mean())
        
    

    

In [64]:
def labelEncoding(train,test,prediction=False):
    """Label-encode every object-typed column of ``train`` (and ``test``) in place.

    One ``LabelEncoder`` is fitted per column on the training values and then
    applied to the same column of ``test``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame; its object-typed columns are replaced by integer codes.
    test : pandas.DataFrame
        Frame encoded with the encoders fitted on ``train``.
    prediction : bool, default False
        When True the ``"classLabel"`` column of ``test`` is not transformed.

    Returns
    -------
    list of str
        Names of the columns that were encoded (the object-typed columns of ``train``).

    Notes
    -----
    ``LabelEncoder.transform`` raises ``ValueError`` for a value of ``test`` that
    was not seen in ``train``.
    """
    categorical_cols = train.columns[train.dtypes == object].tolist()
    models = {}
    for col in categorical_cols:
        le = LabelEncoder()
        train[col] = le.fit_transform(train[col])
        # Keep the fitted encoder before any early exit, so the class-label encoder
        # is also recorded in prediction mode (it was skipped before).
        models[col] = le
        if prediction and col == "classLabel":
            continue
        test[col] = le.transform(test[col])

    #pickle.dump(models, open("label_encoder.pckl", 'wb'))

    return categorical_cols
    


In [65]:
def oneHotEncoding(train,test,categorical_cols,classLabel):
    """Append one-hot indicator columns for every categorical feature.

    For each column in ``categorical_cols`` (except ``classLabel``) a
    ``OneHotEncoder`` is fitted on ``train`` and applied to ``test``. The
    indicator columns are named ``<col>_<k>``, where ``k`` is the position of the
    category in the encoder output. The original columns are kept; dropping them
    is left to the caller.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames whose categorical columns are already integer/label encoded.
        They are not modified; extended copies are returned.
    categorical_cols : list of str
        Categorical column names. ``classLabel`` is removed from this list
        in place, as before, and the same list object is returned.
    classLabel : str
        Name of the target column, which must not be one-hot encoded.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame, list of str)
        Extended train frame, extended test frame, and the encoded feature names.
    """
    # The target stays a single column; tolerate lists that do not contain it.
    if classLabel in categorical_cols:
        categorical_cols.remove(classLabel)

    models = []
    for col in categorical_cols:
        oe = OneHotEncoder(categories='auto')

        encoded = oe.fit_transform(train[col].values.reshape(-1, 1)).toarray()
        names = [col + "_" + str(i) for i in range(encoded.shape[1])]
        # Reuse the source index: a default RangeIndex here would make concat align
        # on labels and add NaN-filled rows whenever the frame is not indexed 0..n-1.
        train = pd.concat(
            [train, pd.DataFrame(encoded, columns=names, index=train.index)], axis=1
        )

        encoded = oe.transform(test[col].values.reshape(-1, 1)).toarray()
        names = [col + "_" + str(i) for i in range(encoded.shape[1])]
        test = pd.concat(
            [test, pd.DataFrame(encoded, columns=names, index=test.index)], axis=1
        )

        models.append((col, oe))

    #pickle.dump(models, open("one_hot.pckl", 'wb'))

    return train,test,categorical_cols

    

In [66]:
def overSampling(train,classLabel):
    """Balance the training set by oversampling the minority class with SMOTE.

    Parameters
    ----------
    train : pandas.DataFrame
        Fully numeric training frame containing the target column.
    classLabel : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        Resampled frame with the feature columns in their original order and the
        target column moved to the last position.
    """
    feature_columns = train.columns.drop(classLabel)
    sm = SMOTE(sampling_strategy='minority', random_state=7)
    # `fit_sample` was a deprecated alias that newer imbalanced-learn releases no
    # longer provide; `fit_resample` is the supported entry point.
    oversampled_trainX, oversampled_trainY = sm.fit_resample(
        train.drop(classLabel, axis=1), train[classLabel]
    )
    # The sampler may return arrays or pandas objects depending on its version;
    # normalise both parts to positionally indexed frames before joining them.
    resampled = pd.concat(
        [
            pd.DataFrame(oversampled_trainX).reset_index(drop=True),
            pd.DataFrame(oversampled_trainY).reset_index(drop=True),
        ],
        axis=1,
    )
    resampled.columns = feature_columns.append(pd.Index([classLabel]))

    return resampled

# Exploration

In [67]:
# Load the raw training and validation sets (semicolon-separated CSV files).
train = pd.read_csv("training.csv", sep=";")
test = pd.read_csv("validation.csv", sep=";")

# Not the last expression of the cell, so this preview is evaluated but not rendered.
train.head(30)

# Distinct values (NaN included) found in variable6.
print(train["variable6"].unique())


['c' 'k' 'ff' 'i' 'j' 'q' 'W' 'd' 'm' 'cc' 'aa' 'r' 'x' 'e' nan]


In [68]:
# Check which columns pandas parsed as numeric and which as object (text/categorical).
# The boolean mask marks the object-typed columns; the dtypes themselves are displayed.
categorical_feature_mask = train.dtypes==object
train.dtypes


variable1      object
variable2      object
variable3      object
variable4      object
variable5      object
variable6      object
variable7      object
variable8      object
variable9      object
variable10     object
variable11      int64
variable12     object
variable13     object
variable14    float64
variable15      int64
variable17    float64
variable18     object
variable19      int64
classLabel     object
dtype: object

Looking at the data, we see that variable2, variable3 and variable8 are numeric, but they have been read as objects (i.e. treated as categorical), so we must fix that.

In [69]:
# Convert the three numeric columns that were read as objects to floats.
# objectsToNumbers replaces ',' by '.' before parsing and modifies the frames in place.
columns=["variable2" ,"variable3","variable8"]
objectsToNumbers(columns,train)
objectsToNumbers(columns,test)

# Only the last expression of a cell is rendered: the train preview is evaluated
# but not shown, and the table displayed below is the validation set.
train.head(30)
test.head(30)

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
0,b,32.33,0.00075,u,g,e,bb,1.585,t,f,0,t,s,420.0,0,4200000.0,,1,no.
1,b,23.58,0.000179,u,g,c,v,0.54,f,f,0,t,g,136.0,1,1360000.0,,0,no.
2,b,36.42,7.5e-05,y,p,d,v,0.585,f,f,0,f,g,240.0,3,2400000.0,,1,no.
3,b,18.42,0.001042,y,p,aa,v,0.125,t,f,0,f,g,120.0,375,1200000.0,,0,no.
4,b,24.5,0.001334,y,p,aa,v,0.04,f,f,0,t,g,120.0,475,1200000.0,f,1,no.
5,a,39.08,0.0004,u,g,c,v,3.0,f,f,0,f,g,480.0,0,4800000.0,f,0,no.
6,b,23.42,0.0001,u,g,c,v,0.5,f,f,0,t,s,280.0,0,2800000.0,,1,no.
7,b,29.58,0.000475,u,g,m,v,2.0,f,t,1,t,g,460.0,68,4600000.0,t,0,no.
8,b,27.42,0.00125,u,g,aa,bb,0.25,f,f,0,t,g,720.0,0,7200000.0,f,1,no.
9,b,58.58,0.000271,u,g,c,v,2.415,f,f,0,t,g,320.0,0,3200000.0,,0,no.


In [70]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric training columns.
train.describe()

Unnamed: 0,variable2,variable3,variable8,variable11,variable14,variable15,variable17,variable19
count,3661.0,3700.0,3700.0,3700.0,3600.0,3700.0,3600.0,3700.0
mean,32.820713,0.000585,3.439496,4.16,162.695,2246.705946,1626950.0,0.925405
std,12.666181,0.00054,4.335229,6.750553,156.045682,8708.571126,1560457.0,0.262772
min,13.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,0.00015,0.5,0.0,0.0,0.0,0.0,1.0
50%,28.67,0.000425,1.75,2.0,120.0,113.0,1200000.0,1.0
75%,40.83,0.000963,5.0,6.0,280.0,1059.75,2800000.0,1.0
max,80.25,0.0028,28.5,67.0,1160.0,100000.0,11600000.0,1.0


In [71]:
# Count the training rows that contain at least one missing value.
print("total number of rows")
print(len(train))
print("number of rows have  null values ")
print(train.isnull().any(axis=1).sum())

# Too many rows are affected to simply drop them, so missing values are imputed instead.


total number of rows
3700
number of rows have  null values 
2237


In [72]:
# Count the validation rows that contain at least one missing value
# (rows with any null, not rows whose values are all null).
print("total number of rows")
print(len(test))
print("number of null rows ")
print(test.isnull().any(axis=1).sum())


total number of rows
200
number of null rows 
116


In [73]:

# Number of missing values per column in the training set.

train.isnull().sum()



variable1       39
variable2       39
variable3        0
variable4       64
variable5       64
variable6       66
variable7       66
variable8        0
variable9        0
variable10       0
variable11       0
variable12       0
variable13       0
variable14     100
variable15       0
variable17     100
variable18    2145
variable19       0
classLabel       0
dtype: int64

In [74]:
# Number of missing values per column in the validation set.
test.isnull().sum()

variable1       3
variable2       3
variable3       0
variable4       2
variable5       2
variable6       3
variable7       3
variable8       0
variable9       0
variable10      0
variable11      0
variable12      0
variable13      0
variable14      3
variable15      0
variable17      3
variable18    111
variable19      0
classLabel      0
dtype: int64

 We see that every column has only a few null values, except variable18 (more than 50% of its values are null).

- So variable18 will be handled later; for now the other variables are imputed (numeric columns with `mean()`, string columns with `mode()`).


In [75]:
# Categorical columns: fill the gaps with the most frequent value of each column.
# NOTE(review): each frame is imputed with its own statistics, so the validation set
# is filled from validation data rather than from the training data.
modes=["variable1","variable4","variable5","variable6","variable7"]

imputeWithMode(modes,train)
imputeWithMode(modes,test)
    
# Numeric columns: fill the gaps with the column mean.
# (`modes` is reused here as the list of mean-imputed columns.)
modes=["variable14","variable17","variable2"]
imputeWithMean(modes,train)
imputeWithMean(modes,test)    


# Confirm that only variable18 still has missing values.
train.isnull().sum()


variable1        0
variable2        0
variable3        0
variable4        0
variable5        0
variable6        0
variable7        0
variable8        0
variable9        0
variable10       0
variable11       0
variable12       0
variable13       0
variable14       0
variable15       0
variable17       0
variable18    2145
variable19       0
classLabel       0
dtype: int64

### Now look at the correlation between columns

In [76]:


# Replace every column by the integer codes returned by factorize(), then compute
# the pairwise correlation of those codes, so text columns can be included.
# NOTE(review): numeric columns are factorized too, so the result measures the
# correlation of the codes rather than of the numeric values - confirm this is intended.
corr=train.apply(lambda x: x.factorize()[0]).corr()
corr

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
variable1,1.0,0.058529,-0.001978,0.044526,0.044526,-0.059088,0.050991,0.116893,-0.056598,0.073279,-0.029171,-0.13527,0.005259,0.168331,-0.019284,0.168331,0.01278,-0.035487,-0.035487
variable2,0.058529,1.0,0.274491,-0.039123,-0.039123,-0.042639,-0.016691,0.097712,0.019879,-0.091633,0.100313,0.010094,0.044087,0.144707,0.1459,0.144707,0.004902,0.057925,0.057925
variable3,-0.001978,0.274491,1.0,-0.0738,-0.0738,0.072217,-0.013799,0.146094,0.059148,-0.109798,0.188087,0.039457,-0.015757,0.032371,0.174686,0.032371,-0.015036,0.069974,0.069974
variable4,0.044526,-0.039123,-0.0738,1.0,1.0,-0.008422,-0.03444,-0.17111,-0.214103,0.166765,-0.104309,-0.17114,0.144363,0.04286,-0.086077,0.04286,0.116571,-0.102442,-0.102442
variable5,0.044526,-0.039123,-0.0738,1.0,1.0,-0.008422,-0.03444,-0.17111,-0.214103,0.166765,-0.104309,-0.17114,0.144363,0.04286,-0.086077,0.04286,0.116571,-0.102442,-0.102442
variable6,-0.059088,-0.042639,0.072217,-0.008422,-0.008422,1.0,0.035999,0.177944,0.140957,-0.051057,0.003959,-0.081705,-0.173708,0.142643,0.000855,0.142643,0.009578,0.080151,0.080151
variable7,0.050991,-0.016691,-0.013799,-0.03444,-0.03444,0.035999,1.0,0.093523,0.050948,-0.008675,0.109298,0.030605,-0.0784,0.016457,-0.080752,0.016457,-0.022485,0.04757,0.04757
variable8,0.116893,0.097712,0.146094,-0.17111,-0.17111,0.177944,0.093523,1.0,0.241325,-0.20974,0.225857,-0.021079,-0.133712,0.140333,0.143299,0.140333,0.004149,0.158145,0.158145
variable9,-0.056598,0.019879,0.059148,-0.214103,-0.214103,0.140957,0.050948,0.241325,1.0,-0.399801,0.275668,-0.063239,-0.392016,-0.004386,0.142053,-0.004386,-0.108476,0.545244,0.545244
variable10,0.073279,-0.091633,-0.109798,0.166765,0.166765,-0.051057,-0.008675,-0.20974,-0.399801,1.0,-0.551843,0.01027,0.300049,-0.012774,-0.334691,-0.012774,0.181617,-0.218894,-0.218894


In [77]:
# Same factorize-based correlation matrix, computed on the validation set.
# NOTE(review): as for the training set, this correlates factorize() codes, not raw values.
corr=test.apply(lambda x: x.factorize()[0]).corr()
corr

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
variable1,1.0,-0.023352,0.093426,-0.070427,-0.070427,0.062747,0.16055,0.093942,-0.018021,0.069544,0.081673,1.483598e-17,0.126668,-0.097159,0.024974,-0.097159,0.064157,-0.066815,-0.067873
variable2,-0.02335204,1.0,0.360155,-0.100927,-0.100927,0.098805,0.058011,0.393483,-0.136146,0.100784,0.178788,0.01564931,0.048764,0.446654,0.404003,0.446654,0.066912,-0.023474,0.203301
variable3,0.09342599,0.360155,1.0,-0.004177,-0.004177,0.170251,0.254666,0.405679,-0.15095,0.14358,0.180989,-0.02049228,0.047674,0.187193,0.200487,0.187193,0.093758,-0.043957,0.214476
variable4,-0.07042667,-0.100927,-0.004177,1.0,1.0,0.035031,-0.013618,-0.160119,0.098184,-0.128449,-0.106321,0.1046299,-0.00304,-0.117275,-0.136625,-0.117275,0.002184,-0.127881,-0.158147
variable5,-0.07042667,-0.100927,-0.004177,1.0,1.0,0.035031,-0.013618,-0.160119,0.098184,-0.128449,-0.106321,0.1046299,-0.00304,-0.117275,-0.136625,-0.117275,0.002184,-0.127881,-0.158147
variable6,0.06274691,0.098805,0.170251,0.035031,0.035031,1.0,0.179394,0.013294,-0.113035,0.117786,0.173325,0.01420025,0.096357,0.059191,0.070445,0.059191,0.09017,-0.045441,0.178708
variable7,0.1605498,0.058011,0.254666,-0.013618,-0.013618,0.179394,1.0,0.186284,-0.175899,0.216433,0.209348,0.09731705,0.047125,-0.00274,0.099188,-0.00274,0.048528,-0.183821,0.295379
variable8,0.09394202,0.393483,0.405679,-0.160119,-0.160119,0.013294,0.186284,1.0,-0.347467,0.249841,0.282955,-0.06275823,0.051231,0.2028,0.271546,0.2028,0.086131,0.005274,0.29441
variable9,-0.01802131,-0.136146,-0.15095,0.098184,0.098184,-0.113035,-0.175899,-0.347467,1.0,-0.522323,-0.452192,0.01011443,-0.039672,-0.16502,-0.180424,-0.16502,-0.142513,-0.030343,-0.720398
variable10,0.06954358,0.100784,0.14358,-0.128449,-0.128449,0.117786,0.216433,0.249841,-0.522323,1.0,0.70875,0.08006408,0.218518,0.09833,0.358386,0.09833,0.245677,0.040032,0.56905


### From the correlation analysis

-  We see that variable4 and variable5 are perfectly correlated in both the test and train data sets.
-  We see that variable17 and variable14 are perfectly correlated in both the test and train data sets.
<br>
So we will remove variable5 and variable17.

----------------------------------------
-  We see that variable18 has a weak correlation with the label and also has many NaN values (more than 50%); I also have no other information to decide whether this column is important.
<br>
So we will remove variable18.

-----------------------------------------

-  We see that classLabel and variable19 are perfectly correlated in the train data set but not in the test data set, so something is wrong with this column and it can hurt the accuracy of the learned classifier.
<br>
So I will delete this column.

In [78]:
# Remove the redundant / unreliable columns from both data sets (in place).
columns = ["variable5", "variable17", "variable18", "variable19"]

train.drop(columns=columns, inplace=True)
test.drop(columns=columns, inplace=True)


In [79]:
 # CHECK BALANCING 
# Print how many training rows belong to each class.
labels = train["classLabel"].unique()
for i in labels:
    print(i, train["classLabel"].eq(i).sum())
    

no. 276
yes. 3424


So the training data set is not balanced, which will affect the precision and recall of the model. This can be solved
using oversampling, e.g. the SMOTE technique.

# ********** ******** ************** **************
# Preprocessing
# ********** ******** ************** **************
- Read the data correctly (read variable2, 3, 8, 14, 15, 17 and 19 as numeric values)
- Impute variable1, 4, 5, 6 and 7 with the mode (because they are categorical variables)
- Impute variable2, 14 and 17 with the mean (because they are numeric variables)
- Remove the variable18 column (because more than 50% of its values are null and it also has a strong correlation with variable10)
- Remove variable5 because it is perfectly correlated with variable4
- Remove variable17 because it is perfectly correlated with variable14
- Remove variable19 because it can lead to misclassification (perfectly correlated with the class label in the train data set but not in the test data set)
- Because the train data is unbalanced, which can lead to a high false-positive or false-negative rate, we will use the SMOTE technique to generate synthetic samples (so the size of the training data set will increase)
- Because the data has many categorical features that must be encoded as numbers, I will encode them using one-hot encoding (better than label encoding here because the categorical values have no order relationship)


In [80]:

# Full preprocessing pipeline, restarted from the raw files so that it does not depend
# on the state left behind by the exploration cells above.
train = pd.read_csv("training.csv",sep=';')
test = pd.read_csv("validation.csv",sep=';')
   
## read numerical data correctly
# (these three columns are parsed as objects; convert them to floats in place)
columns=["variable2" ,"variable3","variable8"]
objectsToNumbers(columns,train)
objectsToNumbers(columns,test)

## missing value imputation
# categorical columns -> mode; each frame is imputed with its own statistics
columns=["variable1","variable4","variable5","variable6","variable7"]
imputeWithMode(columns,train)
imputeWithMode(columns,test)
    
# numeric columns -> mean
columns=["variable14","variable17","variable2"]
imputeWithMean(columns,train)
imputeWithMean(columns,test)    

## remove unwanted columns
# (variable5 and variable17 were imputed above but are dropped here anyway)
columns = ["variable5","variable17","variable18","variable19"]
train.drop(columns, axis=1, inplace=True)
test.drop(columns, axis=1, inplace=True)

print ("Dealing with missing values and unwanted columns finished")


## label encoding
# Encodes every object-typed column of train (and the same columns of test) in place
# and returns the list of encoded column names, classLabel included.
categoryColumns=labelEncoding(train,test)


## one-hot encoding
# Adds <column>_<k> indicator columns; the returned list no longer contains classLabel.
train,test,categoryColumns= oneHotEncoding(train,test,categoryColumns,"classLabel")

## remove original columns
# The label-encoded feature columns are now redundant with their indicator columns.
train.drop(categoryColumns, axis=1, inplace=True)
test.drop(categoryColumns, axis=1, inplace=True)

# Prints the whole encoded training frame as plain text.
print(train)

print("Encoding finished")


# oversampling the training data
# Only the training set is resampled; the validation set keeps its original distribution.
train=overSampling(train,"classLabel")

print("OverSampling (SMOTE) finished")


# save the output
# No `index` argument is passed, so the row index is written as the first CSV column.
train.to_csv("training_processed.csv",sep=';')
test.to_csv("validation_processed.csv",sep=';')

# Display the final, oversampled training frame.
train



Dealing with missing values and unwanted columns finished
      variable2  variable3  variable8  variable11  variable14  variable15  \
0     17.920000   0.000054      1.750           1      80.000           5   
1     16.920000   0.000034      0.290           0     200.000           0   
2     31.250000   0.000112      0.000           1      96.000          19   
3     48.170000   0.000133      0.335           0       0.000         120   
4     32.330000   0.000350      0.500           0     232.000           0   
5     34.830000   0.000125      0.500           0     160.000           0   
6     26.170000   0.000200      0.000           0     276.000           1   
7     21.170000   0.000087      0.250           0     280.000         204   
8     28.920000   0.000037      0.290           0     220.000         140   
9     18.170000   0.001025      1.085           0     320.000          13   
10    24.750000   0.001367      1.500           0     280.000           1   
11    31.750000   

Unnamed: 0,variable2,variable3,variable8,variable11,variable14,variable15,variable1_0,variable1_1,variable4_0,variable4_1,...,variable9_0,variable9_1,variable10_0,variable10_1,variable12_0,variable12_1,variable13_0,variable13_1,variable13_2,classLabel
0,17.920000,0.000054,1.750000,1.000000,80.000000,5.000000,1.000000,0.000000,0.0,1.000000,...,1.000000,0.000000,0.000000,1.000000,0.000000,1.000000,1.000000,0.000000,0.000000,0
1,16.920000,0.000034,0.290000,0.000000,200.000000,0.000000,0.000000,1.000000,0.0,0.000000,...,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.000000,0.000000,1.000000,0
2,31.250000,0.000112,0.000000,1.000000,96.000000,19.000000,0.000000,1.000000,0.0,1.000000,...,1.000000,0.000000,0.000000,1.000000,1.000000,0.000000,1.000000,0.000000,0.000000,0
3,48.170000,0.000133,0.335000,0.000000,0.000000,120.000000,1.000000,0.000000,0.0,1.000000,...,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.000000,0
4,32.330000,0.000350,0.500000,0.000000,232.000000,0.000000,0.000000,1.000000,0.0,1.000000,...,1.000000,0.000000,1.000000,0.000000,0.000000,1.000000,1.000000,0.000000,0.000000,0
5,34.830000,0.000125,0.500000,0.000000,160.000000,0.000000,1.000000,0.000000,0.0,0.000000,...,1.000000,0.000000,1.000000,0.000000,0.000000,1.000000,1.000000,0.000000,0.000000,0
6,26.170000,0.000200,0.000000,0.000000,276.000000,1.000000,1.000000,0.000000,0.0,1.000000,...,1.000000,0.000000,1.000000,0.000000,0.000000,1.000000,1.000000,0.000000,0.000000,0
7,21.170000,0.000087,0.250000,0.000000,280.000000,204.000000,0.000000,1.000000,0.0,0.000000,...,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.000000,0
8,28.920000,0.000037,0.290000,0.000000,220.000000,140.000000,0.000000,1.000000,0.0,1.000000,...,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.000000,0
9,18.170000,0.001025,1.085000,0.000000,320.000000,13.000000,0.000000,1.000000,0.0,1.000000,...,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.000000,0


In [81]:
# Final status message marking the end of the preprocessing notebook.
print ("\nPreprocessing finished \n")


Preprocessing finished 

