In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Common Functions

In [2]:
def remove_outliers(df_in,outlier_cols):
    df = df_in.copy()
    if type(outlier_cols) == str:
        outlier_cols = [outlier_cols]
    
    desc = df.describe()
    for col in outlier_cols:
        q1 = desc.loc['25%',col]
        q3 = desc.loc['75%',col]
        iqr = q3 - q1
        low = q1 - (1.5 * iqr)
        high = q3 + (1.5 * iqr)
        df = df[(df[col] >= low) & (df[col] <= high)]
        
    return df

In [3]:
def convert_cat(df, cat_cols,output_col=None):
    """
    This function converts all categorical columns into numerical boolean columns.
    
    There are 3 parameters: df, cat_cols, and output_var. 
    
    1. df is the dataframe which needs to have categorical variables converted to numerical variables
    2. cat_cols needs to be a list that contains the names of all categorical columns that need to be converted.
    3. output_var is the name of the output or response variable.  It is set to 'Output' as default."""
    
    df.reset_index(inplace = True, drop = True)
    df_out = df.copy()
    
    for col in cat_cols:
        dummy_col = pd.get_dummies(df_out[col],drop_first = True)
        df_out.drop(col,axis=1, inplace = True)
        df_out = df_out.join(dummy_col)
    
    if output_col != None:
        loc_df_out = list(df_out.columns).index(output_col)
        df_out = df_out[list(df_out.columns[:loc_df_out]) + list(df_out.columns[loc_df_out + 1:]) + list(df_out.columns[loc_df_out:loc_df_out + 1])]
    return df_out

In [4]:
def normalize(df,cols_to_ignore = None):
    """
    This function takes in a dataframe as a parameter and returns the same dataframe with all the features normalized between 0 and 1 using rescaling (min-max normalization)
    """
    l_min = []
    l_max = []
    desc = df.describe()
    if cols_to_ignore == None:
        for col in df.columns:
            l_min.append(desc[col]['min'])
            l_max.append(desc[col]['max'])
        
        t_min = list(zip(df.columns, l_min))
        t_max = list(zip(df.columns, l_max))
 

    else:
        for col in df.drop(cols_to_ignore,axis = 1).columns:
            l_min.append(desc[col]['min'])
            l_max.append(desc[col]['max'])

        t_min = list(zip(df.drop(cols_to_ignore,axis = 1).columns, l_min))
        t_max = list(zip(df.drop(cols_to_ignore,axis = 1).columns, l_max))
    
   
    d_min = {}
    for col,val in t_min:
        d_min[col]=val
    
    d_max = {}
    for col,val in t_max:
        d_max[col]=val
    
    df_copy = df.copy()
    for key in d_min.keys():
        df_copy[key] = df_copy[key].apply(lambda x: (x - d_min[key])/ (d_max[key] - d_min[key]))
    
    return df_copy

In [5]:
def split_df(df,test_size = 0.3,df_to_return = 'df_train'):
    """
    This function takes in a Pandas DataFrame and returns a 
    dataframe that is a subset of that Pandas DataFrame.
    
    There are 3 parameters: df, test_size, and df_to_return
    
    df needs to be a Pandas DataFrame and is the superset dataframe to be divided.
    test_size is the proportion of the dataframe you want to be the testing dataset.
    test_size is set to 0.3 by default.
    df_to_return needs to specified as either 'df_train' or df_test' 
    to return the correct subset dataframe. df_to_return is set to 'df_train' by default
    """
    split_num = int(df.count()[0] * (1-test_size) //1)
    df_train = df.iloc[:split_num,:]
    df_test = df.iloc[split_num:,:]
    if df_to_return in ['df_train','train']:
        return df_train
    elif df_to_return in ['df_test','test']:
        return df_test

In [6]:
def shuffled_split_dfs(df,test_size = 0.3):
    """
    This function takes in a Pandas DataFrame and returns a list of 2
    dataframes.  The first dataframe is the train and the second is the test df.
    
    There are 2 parameters: df and test_size
    
    df needs to be a Pandas DataFrame and is the superset dataframe to be divided.
    test_size is the proportion of the dataframe you want to be the testing dataset.
    test_size is set to 0.3 by default.
    """
    df_copy = df.copy()
    df_copy = df_copy.sample(frac = 1).reset_index(drop = True)
    split_num = int(df_copy.count()[0] * (1-test_size) //1)
    df_train = df_copy.iloc[:split_num,:]
    df_test = df_copy.iloc[split_num:,:]
    return ([df_train,df_test])

In [54]:
def check_accuracy(df = None,pred_df = None, test_df = None, algo = 'lin',target_class = None):
    
    """
    This function takes in a pandas DataFrame and returns the accuracy of the model
    
    There are 5 parameters: df and algo
    
    1. df needs to be a Pandas DataFrame and algo is the algorithm used.
    2. pred_df is the prediction dataframe used for the knn algorithm
    3. test_df is the test dataframe used for the knn algorithm
    4. algo is set to 'lin' by default but can also be specified as 'log' or 'knn'
    5. target_class is the output_variable
    """
    
    if algo == 'lin':
        df_out = df.copy()
        df_out['error'] = df.iloc[:,-2] - df.iloc[:,-1]
        ME = df_out['error'].mean()
        MAE = np.abs(df_out['error']).mean()
        RMSE = (sum(df_out['error']**2)/df_out.count()[0]+1) ** 0.5
        return {'ME':ME,'MAE':MAE,'RMSE':RMSE}
    
    elif algo == 'log':
        import numpy as np
        from sklearn.metrics import confusion_matrix,classification_report
        if target_class == None:
            target_class = df.columns[-4]
        print(confusion_matrix(df[target_class].values,df['Crisp'].values))
        print(classification_report(df[target_class].values,df['Crisp'].values))
    
        return sum(df['Correct?']/df.count()[0])
    
    elif algo == 'knn':
        pred = pred_df[target_class]
        test = test_df[target_class]
        return sum(pred == test) / len(pred)

# Linear Regression and Logistic Regression

In [8]:
def stoch_grad_desc(dataset,output_col,cols_to_ignore = None,alpha = 0.3,epoch = 10,algo = 'lin',initial_coeffs = 1):
    """
    This function returns a list of the coefficients for the specified algorithm.  
    Currently, this function only performs Linear and Logistic Regression.
    
    The 4 parameters are: dataset, alpha, epoch, and algo
    
    1. dataset needs to be a pandas DataFrame
    2. alpha is the alpha value used in stochiastic gradient descent.  It is set at 0.3 by default.
    3. epoch is the number of iterations through each row in the dataset algorithm will perform.  epoch is set to 10 by default.
    4. algo is the specific algorithm to be used.  algo is 'lin' by default for Linear Regression but can also be specified as 'log' for Logistic Regression
    """

    loc_output_col = list(dataset.columns).index(output_col)
    dataset = dataset[list(dataset.columns[:loc_output_col]) + list(dataset.columns[loc_output_col+1:]) + list(dataset.columns[loc_output_col:loc_output_col + 1])]
    
    from math import exp
    count_rows = dataset.count()[1]
    
    if cols_to_ignore != None:
        dataset_1 = pd.DataFrame(pd.Series(np.ones(dataset.count()[0])),columns = ['X0']).join(dataset.drop(cols_to_ignore,axis = 1))
    else:
        dataset_1 = pd.DataFrame(pd.Series(np.ones(dataset.count()[0])),columns = ['X0']).join(dataset)
        
    coeffs = list(np.ones(len(dataset_1.columns[0:-1])))
    coeffs = [i * initial_coeffs for i in coeffs]
    
    df_columns = list(dataset_1.columns)
    df_input_cols = df_columns
    df_input_cols.remove(output_col)
    
    for ep in range(epoch):
        for row in range(count_rows):
            y = dataset_1.loc[row,output_col]
            output_terms = []
            #return y
            for col in df_input_cols:
                output_terms.append((coeffs[dataset_1.columns.get_loc(col)],dataset_1.loc[row,col]))
            
            output_list = [(x*y) for (x,y) in output_terms]
            
            output = sum(output_list)
            
            if algo == 'lin':
                pred = output
                for i in range(len(coeffs)):
                    coeffs[int(i)] += alpha * (dataset_1.loc[row,output_col] - pred) * dataset_1.iloc[row,int(i)]
                
            elif algo == 'log':
                pred = 1 / (1 + exp(-output))
                
                for i in range(len(coeffs)):
                    coeffs[i] = coeffs[i] + alpha * (y - pred) * pred * (1 - pred) * dataset_1.iloc[row,i]
                    
    
    
    return coeffs

In [9]:
def make_predictions(dataset, coeff_list, output_col,cols_to_ignore = None,algo = 'lin'):
    """
    This function takes in a pandas DataFrame and a list that contains 
    coefficients for the specified algorith used in the stoch_grad_desc function
    and returns the same dataset (with the addition of the first column being 1s to
    represent X0 in the regression formula) plus a new column at the end, 'Prediction'.
    
    There are 3 parameters: dataset, coeff_list, and algo
    
    1. dataset is the dataframe to used to make predictions dataset needs to be a Pandas DataFrame
    2. coeff_list should be the list that was the result of running the stoch_grad_desc function
    3. algo is the specific algorithm to be used.  algo is set to 'lin' by default but can be set to 'log'"""
    
    loc_output_col = list(dataset.columns).index(output_col)
    dataset = dataset[list(dataset.columns[:loc_output_col]) + list(dataset.columns[loc_output_col+1:]) + list(dataset.columns[loc_output_col:loc_output_col + 1])]
    dataset_index = dataset.index
    
    if type(cols_to_ignore) == list and cols_to_ignore != None:
        df_ignored_cols = dataset.loc[:,cols_to_ignore]
    elif type(cols_to_ignore) != list and cols_to_ignore != None:
        df_ignored_cols = dataset.loc[:,[cols_to_ignore]]
    if cols_to_ignore != None:
        df_ignored_cols.reset_index(inplace = True)
    
    from math import exp
    
    dataset.reset_index(inplace = True, drop = True)
    
    if cols_to_ignore != None:
        dataset_out = pd.DataFrame(pd.Series(np.ones(dataset.count()[0]))).join(dataset.drop(cols_to_ignore,axis = 1))
    else:
        dataset_out = pd.DataFrame(pd.Series(np.ones(dataset.count()[0]))).join(dataset)

    
    dataset_out.rename(mapper = {0:'X0'},axis = 1, inplace = True)
    
    coeffs = coeff_list
    pred = []
    
    for row in range(dataset_out.count()[0]):
        output_terms = []
        for col in dataset_out.columns[0:-1]:
            output_terms.append((coeffs[dataset_out.columns.get_loc(col)],dataset_out.loc[row,col]))
        output_list = [x*y for (x,y) in output_terms]
        
        output = sum(output_list)
        if algo == 'lin':
            pred.append(output)
        
        elif algo == 'log':
            pred.append(1/(1 + exp(-output)))
    dataset_out = dataset_out.join(pd.DataFrame(pred))
    dataset_out.rename(mapper = {0: 'Prediction'},axis = 1, inplace = True)
    
    if algo == 'log':
        dataset_out['Crisp'] = dataset_out['Prediction'].apply(lambda predi: 1 if predi >= 0.5 else 0)
        dataset_out['Correct?'] = dataset_out.iloc[:,-3] == dataset_out['Crisp']
    
    if cols_to_ignore != None:
        dataset_out = pd.concat([df_ignored_cols,dataset_out],axis = 1)
    dataset_out.drop('X0',axis = 1,inplace = True)
    
    dataset_out.set_index(dataset_index,inplace = True)
    return dataset_out

# Logistic Regression (Titanic Dataset)

In [138]:
titanic_test = pd.read_csv(r'C:\Users\ssiva\Downloads\Python-Data-Science-and-Machine-Learning-Bootcamp\Python-Data-Science-and-Machine-Learning-Bootcamp\Machine Learning Sections\Logistic-Regression\titanic_test.csv')

In [139]:
titanic_train = pd.read_csv(r'C:\Users\ssiva\Downloads\Python-Data-Science-and-Machine-Learning-Bootcamp\Python-Data-Science-and-Machine-Learning-Bootcamp\Machine Learning Sections\Logistic-Regression\titanic_train.csv')

In [140]:
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [141]:
titanic_train.drop(['PassengerId','Name','Ticket'],axis = 1,inplace = True)

In [142]:
titanic_train.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [143]:
titanic_train.drop(['Cabin'],axis = 1,inplace = True)
titanic_test.drop(['Cabin'],axis = 1,inplace = True)

In [145]:
titanic_train.corr()['Age']

Survived   -0.077221
Pclass     -0.369226
Age         1.000000
SibSp      -0.308247
Parch      -0.189119
Fare        0.096067
Name: Age, dtype: float64

In [146]:
titanic_train.groupby('Pclass').mean()['Age']

Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64

In [147]:
d_Pclass_to_Age = {}
for i in titanic_train.groupby('Pclass').mean()['Age'].index:
    d_Pclass_to_Age[i] = titanic_train.groupby('Pclass').mean()['Age'].loc[i]
d_Pclass_to_Age

{1: 38.233440860215055, 2: 29.87763005780347, 3: 25.14061971830986}

In [148]:
import math

In [151]:
titanic_train['Age'] = titanic_train[["Pclass","Age"]].apply(lambda cols: d_Pclass_to_Age[cols[0]] if math.isnan(cols[1]) else cols[1],axis = 1)
titanic_test['Age'] = titanic_test[["Pclass","Age"]].apply(lambda cols: d_Pclass_to_Age[cols[0]] if math.isnan(cols[1]) else cols[1],axis = 1)

In [152]:
titanic_train.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

In [153]:
titanic_train.dropna(inplace = True)
titanic_test.dropna(inplace = True)

In [154]:
titanic_train.count()[0]

889

In [156]:
titanic_train.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C


In [157]:
tit_train_cnvrtd = convert_cat(titanic_train,['Sex','Embarked'],output_col = 'Survived')
tit_train_cnvrtd.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,male,Q,S,Survived
0,3,22.0,1,0,7.25,1,0,1,0
1,1,38.0,1,0,71.2833,0,0,0,1
2,3,26.0,0,0,7.925,0,0,1,1
3,1,35.0,1,0,53.1,0,0,1,1
4,3,35.0,0,0,8.05,1,0,1,0


In [158]:
tit_test_cnvrtd = convert_cat(titanic_test,['Sex','Embarked'])
tit_test_cnvrtd.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,male,Q,S
0,892,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,1,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,1,1,0
3,895,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,1,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,0,0,1


In [159]:
[tit_train,tit_test] = shuffled_split_dfs(tit_train_cnvrtd)

In [160]:
tit_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,male,Q,S,Survived
0,1,30.0,0,0,31.0,0,0,0,1
1,2,33.0,1,2,27.75,0,0,1,1
2,1,42.0,0,0,26.2875,1,0,1,1
3,3,20.0,0,0,4.0125,1,0,0,0
4,3,25.14062,0,0,7.8958,1,0,1,0


In [161]:
tit_test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,male,Q,S,Survived
622,3,21.0,0,0,7.8,1,0,1,0
623,3,32.0,0,0,7.925,1,0,1,0
624,3,21.0,0,0,7.775,1,0,1,0
625,3,40.0,1,4,27.9,1,0,1,0
626,2,38.0,0,0,13.0,0,0,1,0


In [162]:
tit_train_norm = normalize(tit_train)
tit_test_norm = normalize(tit_test)

In [163]:
tit_train_norm.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,male,Q,S,Survived
0,0.0,0.371701,0.0,0.0,0.060508,0.0,0.0,0.0,1.0
1,0.5,0.409399,0.125,0.333333,0.054164,0.0,0.0,1.0,1.0
2,0.0,0.522493,0.0,0.0,0.05131,1.0,0.0,1.0,1.0
3,1.0,0.246042,0.0,0.0,0.007832,1.0,0.0,0.0,0.0
4,1.0,0.310639,0.0,0.0,0.015412,1.0,0.0,1.0,0.0


In [164]:
coeffs_list = stoch_grad_desc(tit_train_norm,output_col = 'Survived',alpha = 0.1, epoch = 150,algo='log')

In [165]:
coeffs_list

[4.455458692742193,
 -2.6925503283033336,
 -3.579250111601648,
 -3.4683052327819657,
 -0.31169918859296575,
 1.1578828218699968,
 -2.8185212183764157,
 0.40999863232419004,
 -0.2509700991412977]

In [166]:
tit_train_pred = make_predictions(tit_train_norm,coeff_list = coeffs_list,algo = 'log',output_col = 'Survived')

In [167]:
tit_test_pred = make_predictions(tit_test_norm,coeff_list = coeffs_list,algo = 'log',output_col = 'Survived')

In [168]:
check_accuracy(tit_train_pred,algo = 'log')

[[332  44]
 [ 72 174]]
              precision    recall  f1-score   support

         0.0       0.82      0.88      0.85       376
         1.0       0.80      0.71      0.75       246

    accuracy                           0.81       622
   macro avg       0.81      0.80      0.80       622
weighted avg       0.81      0.81      0.81       622



0.8135048231511206

In [169]:
check_accuracy(tit_test_pred,algo = 'log')

[[153  20]
 [ 30  64]]
              precision    recall  f1-score   support

         0.0       0.84      0.88      0.86       173
         1.0       0.76      0.68      0.72        94

    accuracy                           0.81       267
   macro avg       0.80      0.78      0.79       267
weighted avg       0.81      0.81      0.81       267



0.8127340823970053