### Importing Libraries 

In [None]:
#Declare all libraries and their uses here

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing, svm
from sklearn.preprocessing import OneHotEncoder

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split, cross_val_score

### Loading Dataset

In [None]:
# Load the training data; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('train.csv')

### Analysing the data

Viewing data types and non-null counts of each column of the data

In [None]:
# Print the dtype and non-null count of every column in the loaded frame.
df.info()

Viewing all the counts of each unique value in each column

In [None]:
# For every column, print how often each distinct value occurs, followed by
# a row of asterisks that separates it from the next column.
column_names = df.columns

for column in column_names:
    print(df[column].value_counts(), "\n************\n", sep="\n")

Identify any columns having null values

In [None]:
# Count the missing entries per column (one-column frame, column label 0),
# then display only the columns that have at least one missing entry.
null_df = df.isna().sum().to_frame()
null_df[null_df[0] != 0]

### Identify and drop columns with mostly null values
Columns with more than 70% missing values are dropped, as they should have no significant effect on the target column.

In [None]:
# Columns judged to be mostly empty; remove them from the working frame.
majority_null = ['s59','s57','s56','s55','s54']
df = df.drop(columns=majority_null)

### Encoding

Label encoding for binary categorical data (only two options, e.g. M/F, Y/N). The binary columns were identified from the column value counts. Done using the scikit-learn library.

In [None]:
# Columns to label-encode (the binary categorical columns)
label_encode = ['gender','s11','s12','s53','s58']

# One fitted scikit-learn LabelEncoder per column. Re-fitting a single shared
# encoder would overwrite the value -> code mapping of every column except the
# last one, making it impossible to apply the same encoding to other data or
# to invert it later.
label_encoders = {}

for column in label_encode:
    # Fresh encoder for this column; `label_encoder` still ends up bound to
    # the encoder fitted on the last column, as before.
    label_encoder = preprocessing.LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

Use one-hot encoding instead of label encoding for non-binary categories: it is unclear whether the data is ordinal, so this avoids unintentional ranking and therefore bias. Done using `get_dummies` from the pandas library.

In [None]:
# Categorical columns to expand into indicator (dummy) columns
one_hot_encode = ['s16','s17','s18','s69','s70','s71']

# One indicator column per distinct value of each selected column;
# get_dummies prefixes every new column with the name of its source column.
ohe_df = pd.get_dummies(df[one_hot_encode])

# Replace the raw categorical columns with their indicator columns:
# drop the originals, then attach the encoded frame on the shared index.
df = df.drop(columns=one_hot_encode).join(ohe_df)

In [None]:
# Characters that stand in for digits in 's52', mapped to the intended digit:
# a lower-case "l" becomes "1" and a lower-case "o" becomes "0".
replacement_mapping_dict = {
    "l": "1",
    "o": "0"
}

# Apply the mapping with Series.replace on the whole column. With regex=True
# each key is treated as a pattern, so occurrences inside longer strings are
# substituted too. (Calling str.replace per element via .apply cannot take a
# dict or a `regex` keyword and raises TypeError.)
# NOTE(review): the values remain strings after this step -- confirm whether a
# numeric conversion is wanted before modelling.
df['s52'] = df['s52'].replace(replacement_mapping_dict, regex=True)

In [None]:
# A single call to the replace method can perform several substitutions at
# once when it is given a mapping dictionary, so further substitutions only
# need additional dictionary entries.

df['s52'].value_counts()

### Look at the encoded categorical data

In [None]:
# Re-inspect every column of the current frame: print the frequency of each
# distinct value, then a row of asterisks as a separator.
column_names = df.columns

for column in column_names:
    print(df[column].value_counts(), "\n************\n", sep="\n")

In [None]:
# Reorder the columns so that the target column 'label' comes last;
# all other columns keep their relative order.
non_target_columns = [c for c in df.columns if c != 'label']
df = df[non_target_columns + ['label']]

In [None]:
# One sub-frame per one-hot encoded source column (s16, s17, s18, s69, s70,
# s71), each holding that column's four indicator columns.
df_test = df.loc[:, ['s16_A', 's16_B', 's16_C', 's16_D']]
df_test_2 = df.loc[:, ['s17_A', 's17_B', 's17_C', 's17_D']]
df_test_3 = df.loc[:, ['s18_A', 's18_B', 's18_C', 's18_D']]
df_test_4 = df.loc[:, ['s69_0', 's69_C`', 's69_x', 's69_~1']]
df_test_5 = df.loc[:, ['s70_op: A', 's70_op: B', 's70_op: C', 's70_op: D']]
df_test_6 = df.loc[:, ['s71_a', 's71_b', 's71_c', 's71_d']]

In [None]:
#Variance Inflation Factor function to check for multicollinearity
def variance(df):
    """Compute, print and return the variance inflation factor of each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are the features to check. Every column must be
        convertible to float; otherwise the conversion raises ValueError.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``"feature"`` (the
        column label) and ``"VIF"`` (its variance inflation factor). The same
        frame is also printed.
    """
    # Build the design matrix once instead of once per feature, and cast it
    # to float so integer or boolean indicator columns are passed as plain
    # numbers.
    exog = df.to_numpy(dtype=float)

    vif_data = pd.DataFrame({
        "feature": list(df.columns),
        # VIF of column i, computed against all remaining columns
        "VIF": [variance_inflation_factor(exog, i)
                for i in range(exog.shape[1])],
    })
    print(vif_data)
    return vif_data


In [None]:
# sns.set(rc={'figure.figsize':(11.7,8.27)})

# dataplot = sns.heatmap(df_test, cmap="YlGnBu", annot = True)
  
# displaying heatmap
# plt.show()

In [None]:
# Standardise the selected numeric columns: subtract each column's mean and
# divide by its standard deviation (z-score).
df_desc = df[['n1','n2','n4','n5','n6','n7','n8','n9','n10','n11','n14']]
normalized_df = df_desc.sub(df_desc.mean()).div(df_desc.std())

In [None]:
# Columns used as model inputs (everything selected here except the target).
model_columns = [
       'gender', 's11', 's12', 's13', 's48', 's52', 's53', 's58', 'n1',
       'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12',
       'n13', 'n14', 'n15', 's16_A', 's16_B', 's16_C', 's16_D', 's17_A',
       's17_B', 's17_C', 's17_D', 's18_A', 's18_B', 's18_C', 's18_D', 's69_0',
       's69_C`', 's69_x', 's69_~1', 's70_op: A', 's70_op: B', 's70_op: C',
       's70_op: D', 's71_a', 's71_b', 's71_c', 's71_d']

# Take an explicit copy: assigning columns into a plain selection of `df`
# triggers pandas' SettingWithCopyWarning, because the selection may be
# treated as a slice of the original frame.
df_2 = df[model_columns].copy()

# Overwrite the raw numeric columns with their standardised versions
# (rows are matched on the index).
scaled_columns = ['n1','n2','n4','n5','n6','n7','n8','n9','n10','n11','n14']
df_2[scaled_columns] = normalized_df[scaled_columns]

In [None]:
# Print the dtypes and non-null counts of the model-input frame.
df_2.info()

In [None]:
# Feature matrix and target vector for the classifier.
X, y = df_2, df['label']

In [None]:
# Linear-kernel support vector classifier with regularisation parameter C=1,
# fitted on the feature matrix X and the target values y.
clf = svm.SVC(kernel='linear', C=1)
clf.fit(X, y)

In [None]:
## Clean test data

In [None]:
# Score the classifier on X, y -- the same data it was fitted on -- so this
# is a training score, not an estimate of performance on unseen data.
clf.score(X, y)

In [None]:
# 5-fold cross-validation of the classifier on X, y; yields one score per fold.
scores = cross_val_score(clf, X, y, cv=5)

In [None]:
# Display the per-fold cross-validation scores.
scores