1. **Importing the libraries**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sea

# Set a few plotting defaults
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 18
plt.rcParams['patch.edgecolor'] = 'k'


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# Silence warnings for the rest of the session. Note that the 'ignore' filter is
# installed without a category or module, so it hides warnings from every library,
# not only pandas.
import warnings
warnings.filterwarnings('ignore')



# Any results you write to the current directory are saved as output.

**2. Importing/exploring the train/test datasets and converting them to numeric form**

In [None]:

# Show which files are available in the input directory
import os
print(os.listdir("../input"))

In [None]:
# Load both datasets, then report their (rows, columns) shapes
train_df, test_df = pd.read_csv('../input/train.csv'), pd.read_csv('../input/test.csv')
for label, frame in (
    ("Train Dataset: Rows, Columns: ", train_df),
    ("Test Dataset: Rows, Columns: ", test_df),
):
    print(label, frame.shape)



In [None]:
# Rows flagged with parentesco1 == 1 (presumably one row per household, the head —
# confirm against the data dictionary), reduced to the identifier columns
is_head_train = train_df['parentesco1'] == 1
is_head_test = test_df['parentesco1'] == 1
train_valid = train_df.loc[is_head_train, ['idhogar', 'Id', 'Target']].copy()
test_valid = test_df.loc[is_head_test, ['idhogar', 'Id']].copy()

# Identifier columns (Id, idhogar) of every test row
submission_base = test_df[['Id', 'idhogar']]

In [None]:
# Glimpse at train_df: display its first 7 rows
train_df.head(7)

In [None]:
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
train_df.info()



In [None]:
# Print a heading, then display the describe() summary table of the training set
print ("Summary of Train Dataset: ")
train_df.describe()

In [None]:
# Preview the first 15 rows of the columns stored with dtype 'object'
# (the original author counted 5 such columns)
object_columns = train_df.select_dtypes(include=['object'])
object_columns.head(15)


In [None]:
# Drop the non-numeric identifier columns, which are not used as features
train_df = train_df.drop(columns=['Id', 'idhogar'])
test_df = test_df.drop(columns=['Id', 'idhogar'])

In [None]:
# Explore the 'dependency' column: display its distinct values
train_df['dependency'].unique()

In [None]:
# 'dependency' still contains the labels 'yes' and 'no'. Going by the companion column
# 'SQBdependency' (the squared dependency), 'yes' stands for 1 and 'no' for 0, so both
# labels are mapped to those numbers in the train and test sets alike.
for frame in (train_df, test_df):
    frame['dependency'] = frame['dependency'].replace(('yes', 'no'), (1, 0))
print(train_df['dependency'].unique())


In [None]:
# Display the distinct values of 'edjefe'
train_df['edjefe'].unique()


In [None]:
# Display the distinct values of 'edjefa'
train_df['edjefa'].unique()


In [None]:
# Map 'yes' to 1 and 'no' to 0 in 'edjefa' and 'edjefe' so the columns can become numeric
for frame in (train_df, test_df):
    for column in ('edjefa', 'edjefe'):
        frame[column] = frame[column].replace(('yes', 'no'), (1, 0))

In [None]:
# Cast the three cleaned columns to float in both datasets
for frame in (train_df, test_df):
    for column in ('dependency', 'edjefa', 'edjefe'):
        frame[column] = frame[column].astype(float)


In [None]:
# Double-check that all columns are now numeric: count the columns per dtype
train_df.dtypes.value_counts()


**3. Taking care of the missing values**

In [None]:
# Missing values: rank the columns by their number of nulls, largest first
print ("Top Columns having missing values:")
missing_df = (
    train_df.isnull()
    .sum()
    .to_frame()
    .sort_values(0, ascending = False)
)
missing_df.head()



In [None]:
# 'v18q1' - number of tablets household owns.
# Count the missing 'v18q1' values within each 'v18q' group. Per the original author's
# reading of the result, every family with NaN in 'v18q1' does not own a tablet, which
# is why the missing values can be filled with zero.
def _count_missing(values):
    """Return how many entries of `values` are null."""
    return values.isnull().sum()

train_df.groupby('v18q')['v18q1'].apply(_count_missing)

In [None]:
# Missing tablet counts become 0 in both datasets
for frame in (train_df, test_df):
    frame['v18q1'] = frame['v18q1'].fillna(0)

In [None]:
# rez_esc - Years behind in school.
# Compare its distribution with the instlevel5 / instlevel6 flags, in particular
# for the rows where rez_esc is missing.
print(train_df['rez_esc'].value_counts())
print(train_df['instlevel6'].value_counts())
rez_esc_missing = train_df['rez_esc'].isnull()
print (train_df.loc[rez_esc_missing, 'instlevel5'].value_counts())
print (train_df.loc[rez_esc_missing, 'instlevel6'].value_counts())
# NOTE(review): this last check reads test_df while the others read train_df — confirm this is intended
print (test_df.loc[test_df['rez_esc'] == 0, 'instlevel6'].value_counts())




In [None]:
# The original author observed that a missing 'Years behind in school' goes together
# with a complete academic secondary level, and therefore assigns 6 to those rows.
for frame in (train_df, test_df):
    frame['rez_esc'] = frame['rez_esc'].fillna(6)
print(train_df['rez_esc'].value_counts())




In [None]:
# v2a1, Monthly rent payment: print its distinct values
print(train_df['v2a1'].unique())


In [None]:
# Relate the missing rent values to 'tipovivi3' (=1 rented): print the overall
# distribution of the flag, then its distribution among rows with a missing rent.
# Housing-tenure flags, for reference (kept as comments so that no string literal
# is echoed as the cell's output):
#   tipovivi1, =1 own and fully paid house
#   tipovivi2, =1 own, paying in installments
#   tipovivi3, =1 rented
print(train_df['tipovivi3'].value_counts())
print (train_df.loc[train_df['v2a1'].isnull()]['tipovivi3'].value_counts())

In [None]:
# The rows with a missing rent are households that report no rent payment — investigate further.
# How many of those own a house? (distribution of 'tipovivi1' among the rows with a missing 'v2a1')
print (train_df.loc[train_df['v2a1'].isnull()]['tipovivi1'].value_counts())



In [None]:
# Per the original author's reading of the check above, about 6 out of 7 rows with a
# missing rent belong to households that own a fully paid house (tipovivi1 == 1).
# `a` is the highest and `b` the lowest rent observed across the train and test sets
# (Series.max()/min() skip missing values).
a = max(train_df['v2a1'].max(), test_df['v2a1'].max())
b = min(train_df['v2a1'].min(), test_df['v2a1'].min())
# Households that own their house get the maximum rent (we assume they are the richest ones)
train_df.loc[(train_df['tipovivi1'] == 1), 'v2a1'] = a
test_df.loc[(test_df['tipovivi1'] == 1), 'v2a1'] = a
# Any rent that is still missing gets the minimum rent (we assume they are the poorest ones).
# The filled column is assigned back rather than using fillna(inplace=True) on a column
# selection, which is not guaranteed to write through to the parent frame.
train_df['v2a1'] = train_df['v2a1'].fillna(b)
test_df['v2a1'] = test_df['v2a1'].fillna(b)



In [None]:
# Show `a`, the highest rent across train and test that was assigned to owner households
print (a)

In [None]:
# The rest of the missing values are replaced with column means, as their share of the
# total number of entries is insignificant. The means are computed once on the training
# set and reused for the test set, so both sets are imputed with the same values and no
# test-set statistics enter the preprocessing. fillna() matches the means to columns by
# label, so the extra 'Target' mean is simply unused for a frame without that column.
train_means = train_df.mean()
train_df = train_df.fillna(train_means)
test_df = test_df.fillna(train_means)

In [None]:
# Sanity check: list the columns that still contain at least one missing value
print ('Columns having missing values:')
train_df.columns[train_df.isnull().any()]

In [None]:
# Display the distinct values of the 'Target' column
train_df['Target'].unique()

**4. Dataset visualisation**

In [None]:
# Number of columns currently in the training set
print(train_df.shape[1])

In [None]:
# Bar chart: for each number of distinct values, how many columns have that many
unique_counts = train_df.nunique().value_counts().sort_index()
ax = unique_counts.plot.bar(color = 'blue', figsize = (8, 6), edgecolor = 'k', linewidth = 2)
ax.set_xlabel('Number of Unique Values')
ax.set_ylabel('Count')
ax.set_title('Count of Unique Values in Integer Columns');

In [None]:
# Separate the binomial (yes/no) columns, which are the majority, from the rest:
# collect the positional index of every column with more than two distinct values
# (NaN counts as a value, as it does with unique()).
# The list is built with a comprehension so that no scratch variable is left behind;
# in particular the global `a`, which the rent-imputation cell sets, is not overwritten.
non_binomial = [
    position
    for position in range(train_df.shape[1])
    if train_df.iloc[:, position].nunique(dropna=False) > 2
]
print ("Non-Boolean Columns:")
print (*non_binomial)
# One count plot per non-binary column, followed by the column's positional index
for i in non_binomial:
    p = sea.countplot(data=train_df, x=train_df.iloc[:, i])
    print (i)
    plt.show()
    

In [None]:
import math
# Count plot per non-binary column, with the column's positional index printed first.
# (An earlier experiment that squared these columns in train_df/test_df and plotted
# them again was left disabled in this cell.)
for column_index in non_binomial:
    print (column_index)
    column_values = train_df.iloc[:, column_index]
    p = sea.countplot(data=train_df, x=column_values)
    plt.show()
    

In [None]:
# Dimension reduction: drop every feature whose correlation with 'Target' lies strictly
# between -0.02 and 0.02.
correlations = train_df.corr()['Target'].sort_values()
correlations = correlations.reset_index().values   # rows of (column name, correlation)
print('Original dataset shape: ',train_df.shape)
# The last row of the sorted table is left out of the scan; all weak columns are
# collected first and removed with a single drop.
weak_columns = [
    name
    for name, coefficient in correlations[:-1]
    if -0.02 < coefficient < 0.02 and name != 'SK_ID_CURR'
]
train_df = train_df.drop(weak_columns, axis=1)
print('Dataset shape after adjustments for correlation: ',train_df.shape)

In [None]:
# Remove the squared columns: they are redundant with their base columns and thus
# introduce a bias. The shape is printed before and after for comparison.
# errors='ignore' skips any listed column that is no longer present (the correlation
# filter in the previous step may already have dropped some of them) instead of
# raising a KeyError.
squared_columns = ['SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe', 'SQBhogar_nin',
                   'SQBovercrowding', 'SQBdependency', 'SQBmeaned', 'agesq']
print (train_df.shape)
train_df = train_df.drop(squared_columns, axis = 1, errors = 'ignore')
print (train_df.shape)

In [None]:
# Realign the two datasets so they share the same columns.
# The labels are saved first: the inner join on axis=1 keeps only the columns present
# in both frames, so 'Target' is removed from train_df if test_df lacks it
# (presumably the case — confirm).
y_df = train_df['Target']
train_df, test_df = train_df.align(test_df, join = 'inner', axis = 1)


In [None]:
# Report the shapes of both frames after alignment
print(f"Training set shape:{train_df.shape}, testing set shape:{test_df.shape}")


In [None]:
# Count the columns per dtype in each frame
print (train_df.dtypes.value_counts())
print (test_df.dtypes.value_counts())

In [None]:
# Convert to NumPy: X is the feature matrix, y the labels reshaped to a single column
X = train_df.values
y = y_df.values.reshape(-1, 1)
X.shape

In [None]:
# Hold out 20% of the rows for evaluation; random_state = 0 fixes the shuffle so the
# split is repeatable. The shape of the training part is displayed.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split (X, y,test_size = 0.2, random_state = 0)
X_train.shape

In [None]:

# Shape of the training labels
y_train.shape

In [None]:
# Distinct label values present in the training split
np.unique(y_train)

In [None]:
# First 10 rows of the training features, before scaling
X_train[0:10,:]

In [None]:
# Scaling: fit the standardiser on the training split only, then apply that same
# transformation to the training split, the hold-out split and the full feature matrix
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test, X = (sc.transform(part) for part in (X_train, X_test, X))




In [None]:
# First 10 rows of the training features, after scaling
X_train[0:10,:]

In [None]:
# Abandoned experiment: a Keras ANN tuned with GridSearchCV.
# The code below is kept only inside a string literal, so none of it is executed;
# being the last expression of the cell, the string is merely the cell's value.
'''#building ANN
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
def build_classifier(optimizer):
    classifier = Sequential()
    classifier.add(Dense(input_dim = 137, output_dim = 50, init = 'uniform', activation = 'softmax'))
    classifier.add(Dense(output_dim = 100, init = 'uniform', activation = 'relu'))
    classifier.add(Dense(output_dim = 100, init = 'uniform', activation = 'relu'))
    classifier.add(Dense(output_dim = 100, init = 'uniform', activation = 'relu'))
    classifier.add(Dense(output_dim = 5, init = 'uniform', activation = 'sigmoid'))
    classifier.compile (optimizer = optimizer, loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
    return classifier
classifier = KerasClassifier(build_fn = build_classifier)
parameters = {'batch_size': [15],
              'epochs': [100, 150, 200, 250],
              'optimizer': ['adam']}
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10)
grid_search = grid_search.fit(X_train, y_train)
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_
print ('Best accuracy: ',best_accuracy )
print ('Best parameters: ',best_parameters) # 'batch_size': 15, 'epochs': 250, 'optimizer': 'adam'
classifier = Sequential()
classifier.add(Dense(input_dim = test_df.shape[1], output_dim = 50, init = 'uniform', activation = 'tanh'))
classifier.add(Dense(output_dim = 100, init = 'uniform', activation = 'relu'))
classifier.add(Dense(output_dim = 100, init = 'uniform', activation = 'relu'))
classifier.add(Dense(output_dim = 100, init = 'uniform', activation = 'relu'))
classifier.add(Dense(output_dim = 100, init = 'uniform', activation = 'sigmoid'))
classifier.compile (optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
'''



In [None]:
# Chosen model: a support vector classifier with an RBF kernel.
from sklearn.svm import SVC
classifier = SVC(
    kernel = 'rbf',
    C = 1.0,
    cache_size = 200,
    class_weight = None,
    coef0 = 0.0,
    decision_function_shape = 'ovo',
    degree = 3,
    gamma = 'scale',
    max_iter = -1,
    probability = False,
    random_state = None,
    shrinking = True,
    tol = 0.001,
    verbose = False,
)
# scikit-learn expects the labels as a 1-D array of shape (n_samples,); ravel() flattens
# the (n, 1) column vector so that fit() does not have to convert it (and warn about it).
classifier.fit(X_train, y_train.ravel())
# Notes from the earlier ANN experiments (which used batch_size = 15, epochs = 500),
# trying different parameter combinations:
#1597 tp for 100 epochs, 1624 tp for 150 epochs 1660 for 500, 1633 for 650, 1631 for 750, 1639 for 1000
#1660 for 3 hidden layers, 1622 with 2 layers, 1645 for 4 layers
#1660 for 100 hidden neurons per layer, 1660 for 200 hidden neurons, 1593 for 50 hidden neurons
#1660 for sigmoid final layer, 0 for tanh!
#1660 with softmax input activation, 1722,1710 with tanh, 1710 with additional 2nd tanh layer, 1694 with softmax as 2nd
#1711 with tanh as middle layer (either 50 or 100 neurons)
#1712 with 5 output neurons, 1695 with 1000
#0.362 for corr coef 2% (112), 1% - 0.363 (124) ,3% - 0.371, 4% - 0.381 (1735 with X_train), 5% - 1764, 6% - 1759, 1774/1749 with 7%, 1782/1759 with 8%, 1766-1788 for 10%, 1788 for -20%- 8%, 1786-1796 for 15%, 1766 with 20%
#1788 tanh - sigmoid, 1774 sigmoid-sigmoid, 0 tanh-tanh, 1113 for sigmoid-tanh
#0.381 with optimized ANN, 0.338 with kernel SVM
# Raw (unscaled) test features as a NumPy array, for the final prediction
test_np = test_df.values
# Predictions for the hold-out split (the ANN variant used predict_classes instead)
y_pred = classifier.predict(X_test)


In [None]:
# Reshape the hold-out predictions into a single column and show the first 100 of them
y_pred = y_pred.reshape(-1, 1)
y_pred[0:100,0]

In [None]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print (cm)


In [None]:
# Macro-averaged F1 score on the hold-out split
from sklearn.metrics import f1_score
print (f1_score(y_test, y_pred, average ='macro'))


In [None]:
# Scale the test features with the scaler that was fitted on the training split
test_np = sc.transform(test_np)
# Refit the classifier on all labelled rows. ravel() passes the labels as the 1-D array
# scikit-learn expects instead of an (n, 1) column vector.
classifier.fit(X, y.ravel())
# Predict the test set, then show the first 100 predictions (the ANN variant used predict_classes)
y_pred = classifier.predict(test_np)
y_pred = y_pred.reshape(-1, 1)
y_pred[0:100,0]

In [None]:
# Build the submission: the Id of every test row next to its predicted label.
test_df = pd.read_csv('../input/test.csv')
# .copy() yields an independent frame, so adding a column does not write into a slice
# of test_df (which triggers SettingWithCopyWarning).
submit = test_df[['Id']].copy()
# The label column is named 'Target', the same name the training data uses for it;
# ravel() flattens the (n, 1) prediction array to one value per row.
submit['Target'] = y_pred.ravel()
submit.head()

# Save the submission to a csv file
submit.to_csv('SVMClassification.csv', index = False)

In [None]:
# Shape of the submission frame (rows, columns)
submit.shape