# Python: Introduction to Machine Learning

## Import Libraries 

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

#from sklearn.preprocessing import LabelEncoder  
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

pd.set_option('display.max_columns', None) # Display all columns when there are a lot of columns in dataframe
%matplotlib inline # Display Matplotlib graphs within the Notebook (and note as separate window pop-ups)

UsageError: unrecognized arguments: # Display Matplotlib graphs within the Notebook (and note as separate window pop-ups)


## Import Data 

In [None]:
# Read the training data into a dataframe and preview its first five rows
df = pd.read_csv(filepath_or_buffer='Data/loan_train.csv')
df.head(n=5)

In [None]:
df.shape

## Exploratory Data Analysis (EDA)

- EDA is an important step in the ML/Data Science pipeline 
- Gain a high-level understanding of the data and its characteristics (data types, rows, columns, missing values, etc.)  
- This step helps provide guidance on how to pre-process the data to prep it for model building 

In [None]:
# Display data about the data (nulls, data types, rows/columns, etc.)
df.info() 

In [None]:
# Check for missing Values
df.isnull().sum()

In [None]:
# Display statistical summary for the data 
df.describe()

In [None]:
df.columns

In [None]:
# Print the distinct values found in each of the categorical columns
categorical_cols = ['Gender','Married','Education','Self_Employed', 'Property_Area']

for col in categorical_cols:
    unique_values = df[col].unique()
    print(f'Unique Values for {col}: {unique_values}')

In [None]:
# Checking for any repeated records with regards to Loan ID
len(df.Loan_ID.unique())

#### Let's Summarize! 
- Loan ID is the primary key in the data - it uniquely identifies each record 
- There are 614 rows, 13 columns
- The .describe() function can be used to quickly gauge some statistics about the data 
    - In some cases it can also help identify incorrect data (if this were a biometric dataset with heart rate, a minimum heart rate of 0 would call for investigation!) 
- 7/13 columns have missing values 
- Credit History has the highest number of missing values! 

#### Key Remarks 
- Understanding the data you are working with is very important! 
- Always strive to work with Subject Matter Experts (SMEs) to get insight into the data 
- In a real-world application, you may need to individually evaluate each column and its values to learn the context behind the data 

## Data Analysis / Data Visualization
- Investigate to find relationships and trends within the data 
- Certain features may be more prominent in determining whether the applicant's loan will be approved or not 
- Data Visualization can help reveal key information in the data 
    - Knowing which graphs to use is a key skill that comes with practice and experience! 
- A good starting point is to compare different features against the label (Loan Status) to see if there are any easily distinguishable relationships

In [None]:
# Number of Approved & Not Approved (Y/N) records 
df.Loan_Status.value_counts().plot(kind='barh')

In [None]:
print(df.Loan_Status.value_counts())

#### Gender vs Approval

In [None]:
# Let's understand how loan approval differs between genders:
# first as a table of counts, then as a grouped bar chart
print(pd.crosstab(df['Gender'],df['Loan_Status']))

# Pass the column names by keyword: since seaborn 0.12 the only positional
# argument countplot accepts is `data`, so a positional x vector no longer works
sns.countplot(x='Gender', hue='Loan_Status', data=df)
plt.show()

In [None]:
print(pd.crosstab(df['Gender'], df['Loan_Status']).apply(lambda r: round(r/r.sum(),3)*100, axis=1))

In [None]:
# Share of each loan status within every gender, expressed in percent,
# reshaped into a tidy frame with a 'percent' column for plotting
df_pct = (
    df.groupby('Gender')['Loan_Status']
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

sns.catplot(data=df_pct, x='Gender', y='percent', hue='Loan_Status', kind='bar')
plt.show()

In [None]:
# Let's write this as a function to make it easy to check the loan status against all the parameters
def column_bar(df, column):
    """Summarise how ``Loan_Status`` is distributed across the values of ``column``.

    Prints the crosstab of counts and the row-wise percentages, then shows a
    count plot followed by a percentage bar plot, both split by ``Loan_Status``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``column`` and a ``Loan_Status`` column.
    column : str
        Name of the column to compare against ``Loan_Status``.

    Returns
    -------
    None
        The function only prints and plots.
    """
    # Build the contingency table once and reuse it for both printouts
    counts = pd.crosstab(df[column], df['Loan_Status'])
    print(counts)
    print('\nPercentage')
    print(counts.apply(lambda r: round(r/r.sum(),3)*100, axis=1))

    # Keyword arguments only: since seaborn 0.12 the sole positional argument
    # countplot accepts is `data`, so a positional x vector no longer works
    sns.countplot(x=column, hue='Loan_Status', data=df)
    plt.show()

    # Percentage of each loan status within every value of `column`
    df_pct = df.groupby(column)['Loan_Status'].value_counts(normalize=True)
    df_pct = df_pct.mul(100)
    df_pct = df_pct.rename('percent').reset_index()

    sns.catplot(x=column,y='percent',hue='Loan_Status',kind='bar',data=df_pct)
    plt.show()

In [None]:
# Test the function 
column_bar(df, 'Gender')

#### All Categorical Features vs Approval

In [None]:
#Select the data we want to test
columns = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Credit_History']

In [None]:
for column in columns:
    column_bar(df,column)

#### Continuous Features 

In [None]:
# Scatter-plot helper for the continuous variables
def column_scatter(df,column):
    """Show a scatter plot of ``column`` (x axis) against ``Loan_Status`` (y axis).

    The plot is titled with the column name and displayed immediately.
    """
    x_values = df[column]
    y_values = df['Loan_Status']
    plt.scatter(x_values, y_values)
    plt.title(column)
    plt.show()

In [None]:
# Continuous features to plot against the loan status, one scatter plot each
scatter_columns = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]

for column in scatter_columns:
    column_scatter(df, column)

In [None]:
# Overlaid histograms of applicant income, split by loan status
# (open question from the original author: review whether to keep this cell)

y_loan = df[df['Loan_Status'] == 'Y']
n_loan = df[df['Loan_Status'] == 'N']

# Styling shared by both histograms
kwargs = {'alpha': 0.5, 'bins': 50}

plt.figure(figsize=(10,10))
for subset, colour, label in ((y_loan, 'g', 'Yes'), (n_loan, 'r', 'No')):
    plt.hist(subset['ApplicantIncome'], color=colour, label=label, **kwargs)
plt.legend()
plt.show()

In [None]:
# Let's look at correlation next.
# Only numeric columns can be correlated: since pandas 2.0, DataFrame.corr()
# raises on string columns instead of silently skipping them, so select the
# numeric columns explicitly (this works on older pandas versions as well).
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

## Model Development

### Data Preparation

#### Null Values
- There are many ways to deal with NULL values and it can have a significant impact on how your model performs
    - Deleting rows
    - Replacing with Mean, Median, Mode
    - Imputing values (KNN, ML algorithms, etc.) 

In [None]:
# Lets re-check columns with null values 
df.isnull().sum()

In [None]:
# Lets investigate the Loan Amount 
df['LoanAmount'].describe()

In [None]:
# Lets investigate the Loan Amount field 
plt.hist(df['LoanAmount'], bins=50)
plt.show()

In [None]:
# Fill the missing Loan Amount values with the mean of the column
loan_amount_mean = df['LoanAmount'].mean()
df['LoanAmount'] = df['LoanAmount'].fillna(loan_amount_mean)

In [None]:
# Remove remaining records with Null values
df.dropna(inplace=True)

In [None]:
#Confirm it worked
df.isnull().sum()

In [None]:
#Check for duplicate entries
df.duplicated().any()

In [None]:
#What's the shape of the new data?
df.shape

In [None]:
df.head()

#### Encoding Categorical Values
- ML models can only deal with numerical values 
- Categorical data has to be encoded as numbers for use in models 
- Common techniques: Ordinal Encoding & One-Hot Encoding
    - We will use the **get_dummies()** function in Pandas to do this; however, when building ML for projects, using the **LabelEncoder & OneHotEncoder** classes in Sklearn is recommended 
    - Using get_dummies() produces functionally the same result, and makes the concept quicker and easier to visualize
- When dealing with categorical data in production, additional solutions/algorithms may be required to deal with unseen categorical values

In [None]:
# We need to replace the string labels (Y, N) with numbers.
# Assign the result back instead of calling replace(..., inplace=True) on
# df['Loan_Status']: that form mutates an intermediate object, is deprecated,
# and leaves df unchanged when pandas Copy-on-Write is enabled.
# astype(int) guarantees a numeric dtype for the label column (this assumes no
# missing labels remain, which holds after the dropna() step above).
df['Loan_Status'] = df['Loan_Status'].replace({'N': 0, 'Y': 1}).astype(int)

In [None]:
df.head()
df.shape

In [None]:
# One-hot encode the categorical features using the get_dummies() function in Pandas
# NOTE(review): the dummy columns are named after the raw category values, so
# two features that share labels (e.g. both Yes/No - confirm against the data)
# would produce duplicate column names; consider get_dummies(..., prefix=column)
# and updating any later code that refers to the dummy columns by name.

non_numerical = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area']

encoded_frames = []
for column in non_numerical:
    print(f'{df[column].unique()}')
    enc_df = pd.get_dummies(df[column])
    encoded_frames.append(enc_df)

# Append all indicator columns to the right of the existing columns
df = pd.concat([df, *encoded_frames], axis=1)

In [None]:
pd.set_option('display.max_columns', None)
df.head()

In [None]:
'''
from sklearn.preprocessing import LabelEncoder  
le = LabelEncoder()
non_numerical = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area']

for column in non_numerical:
    print(le.fit(df[column]).classes_)
    df[column+'_Encoded'] = le.fit_transform(df[column])

'''

In [None]:
'''
from sklearn.preprocessing import OneHotEncoder

# Create an instance of a one-hot-encoder 
enc = OneHotEncoder(handle_unknown='ignore')

#for column in non_numerical:
enc_df = pd.DataFrame(enc.fit_transform(df[['Gender_Encoded']]).toarray())
    
    # Merge with main df on key_values 
df = pd.merge(df, enc_df, left_index=True, right_index=True)

df.head()
'''

In [None]:
'''# Heatmap Again
plt.figure(figsize=(16,5))
sns.heatmap(df.corr(),annot=True,cmap='jet')
'''

#### Feature Selection
- After analyzing the data, select the features you will use to help build the model 
- You do not always need to use every single feature. With lots of data, removing unnecessary features can save processing time, save costs, and even improve model performance
- Since the categorical features have been encoded, drop the respective non-encoded categorical columns 

In [None]:
# Drop the identifier, the raw categorical columns and the two gender indicator columns
columns_to_drop = [
    'Loan_ID',
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Female',
    'Male',
]
df = df.drop(columns=columns_to_drop)
df.head()

### Model Development

In [None]:
# Split the dataframe into the label (kept as a one-column frame) and the feature matrix
target_column = 'Loan_Status'
Y = df[[target_column]]
X = df.drop(target_column, axis=1)

In [None]:
# Hold out a quarter of the rows for testing; the fixed seed makes the split reproducible
seed = 12
validation_size = 0.25
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)

In [None]:
# Report the size of each split and how the classes are distributed in it
for features in (X_train, X_test):
    print(features.shape)

print('\nTrain & Test Class Counts\n')

train_counts = Y_train['Loan_Status'].value_counts()
test_counts = Y_test['Loan_Status'].value_counts()
print('Training:\n', train_counts)
print('\nTesting:\n', test_counts)

In [None]:
# Instantiate the classifiers to compare, all with default hyper-parameters
# NOTE(review): the features are not scaled in this notebook, so
# LogisticRegression may emit a ConvergenceWarning with its default max_iter -
# confirm, and consider scaling the features or raising max_iter
models = [LogisticRegression(), KNeighborsClassifier()]
lr, knn = models

#### Model Training & Testing & Evaluation

In [None]:
#Model evaluation tools
from sklearn.metrics import RocCurveDisplay, classification_report, roc_auc_score

try:
    # plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
    # RocCurveDisplay.from_estimator is its replacement
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None

In [None]:
# scikit-learn expects 1-D label arrays; the targets are single-column frames,
# so flatten them once to avoid a DataConversionWarning on every fit
y_train_1d = Y_train.values.ravel()
y_test_1d = Y_test.values.ravel()

for model in models:
    # Train
    model.fit(X_train, y_train_1d)
    print(f'\nDone Training: {model}!')

    # Test
    print(f'Mean Accuracy: {model.score(X_test, y_test_1d)}\n')

    # Evaluate
    y_pred = model.predict(X_test)
    # The signature is classification_report(y_true, y_pred); swapping the two
    # arguments swaps the reported precision and recall
    print(classification_report(y_test_1d, y_pred))

    # ROC AUC ranks continuous scores, so use the positive-class probability
    # (or the decision function for models without predict_proba) rather than
    # the hard 0/1 predictions
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)
    print(f'ROC Score:{roc_auc_score(y_test_1d, y_score)}')

    # RocCurveDisplay.from_estimator replaces plot_roc_curve, which was
    # removed in scikit-learn 1.2
    RocCurveDisplay.from_estimator(model, X_test, y_test_1d)
    plt.show()
#lr_results = models[1].predict()

#### Remarks

- The average score is not always a true representation of how good a model is, especially for classification
- What if the model has to evaluate between apples & oranges, given there are 90 apples & 10 oranges ? 
    - If the model correctly classifies 90 apples, but only 5/10 oranges are correctly classified, the model would still have a high accuracy even though it clearly cannot be trusted to properly classify oranges 

## Considerations 
 - Test out different algorithms -> Support Vector Machine
 - Iterate over the feature selection process
 - Feature Engineering: Develop your own features from the available data 
