In [354]:
# This is my COMP2200 Data Science Portfolio 4 


## Data Science Portfolio Part 4

The goal of the second analysis task is to train linear regression models to predict users' ratings towards items. This involves a standard Data Science workflow: exploring data, building models, making predictions, and evaluating results. In this task, we will explore the impacts of feature selections and different sizes of training/testing data on the model performance. We will use another cleaned combined e-commerce sub-dataset that **is different from** the one in “Analysis of an E-commerce Dataset” task 1.

### Import Cleaned E-commerce Dataset
The csv file named 'cleaned_ecommerce_dataset.csv' is provided. You may need to use the Pandas method, i.e., `read_csv`, for reading it. After that, please print out its total length.

### Explore the Dataset

* Use the methods, i.e., `head()` and `info()`, to have a rough picture about the data, e.g., how many columns, and the data types of each column.
* As our goal is to predict ratings given other columns, please get the correlations between helpfulness/gender/category/review and rating by using the `corr()` method.
* To get the correlations between different features, you may need to first convert the categorical features (i.e., gender, category and review) into numerical values. For doing this, you may need to import `OrdinalEncoder` from `sklearn.preprocessing` (refer to the useful examples [here](https://pbpython.com/categorical-encoding.html))
* Please provide ___necessary explanations/analysis___ on the correlations, and figure out which are the ___most___ and ___least___ correlated features regarding rating. Try to ___discuss___ how the correlation will affect the final prediction results, if we use these features to train a regression model for rating prediction. In what follows, we will conduct experiments to verify your hypothesis.

In [355]:
# Import Library   

import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_selection import RFE
from sklearn.metrics import r2_score

import matplotlib.pylab as plt
%matplotlib inline


# Import ignore warnings 

import warnings
warnings.filterwarnings('ignore')

In [356]:
# Load the raw World Development Indicators CSV into the dataframe `df`

df = pd.read_csv("WorldDevelopmentIndicators.csv")

## 1. Exploring the dataset

In [357]:
# Printing the number of rows in the dataset before cleaning

print('Length of the dataset before cleaning: ', len(df))

In [358]:
# Previewing the first rows of the raw dataset using the method head()

df.head()

In [359]:
# Displaying the raw dataframe before any cleaning

df

In [360]:
# Displaying a summary of the raw dataframe (columns, non-null counts and dtypes)

df.info()

In [361]:
# Printing the number of distinct values in the 'Country' column

print('Number of countries:', len(df['Country'].unique()))

In [362]:
# Printing the number of distinct years the data has been collected across

print('Number of years:', len(df['Year'].unique()))

In [363]:
# Displaying the dimensions (rows, columns) of the dataset before cleaning

df.shape

In [364]:
# Displaying descriptive statistics for LifeExpectancy in the raw dataset

df['LifeExpectancy'].describe()

In [365]:
# Displaying the record(s) with the lowest life expectancy across all years

# Find the minimum first, then print every row whose LifeExpectancy equals it
LifeExpectancyMin = df['LifeExpectancy'].min()
print(df.loc[df['LifeExpectancy'] == LifeExpectancyMin, ['Country', 'Year', 'LifeExpectancy']])

In [366]:
# Displaying the record(s) with the highest life expectancy across all years

# Find the maximum first, then print every row whose LifeExpectancy equals it
LifeExpectancyMax = df['LifeExpectancy'].max()
print(df.loc[df['LifeExpectancy'] == LifeExpectancyMax, ['Country', 'Year', 'LifeExpectancy']])

In [367]:
# Displaying the country with the lowest life expectancy in 2020

# Restrict to 2020 first and look the minimum up inside that subset. Matching the
# 2020 minimum against the full frame could also return rows from other years
# that happen to share the same value.
Data2020 = df.loc[df['Year'] == 2020]
LifeExpectancyMin2020 = Data2020['LifeExpectancy'].min()
print(Data2020.loc[Data2020['LifeExpectancy'] == LifeExpectancyMin2020, ['Country', 'Year', 'LifeExpectancy']])

In [368]:
# Displaying the country with the highest life expectancy in 2020

# Restrict to 2020 first and look the maximum up inside that subset. Matching the
# 2020 maximum against the full frame could also return rows from other years
# that happen to share the same value.
Data2020 = df.loc[df['Year'] == 2020]
LifeExpectancyMax2020 = Data2020['LifeExpectancy'].max()
print(Data2020.loc[Data2020['LifeExpectancy'] == LifeExpectancyMax2020, ['Country', 'Year', 'LifeExpectancy']])

## 2. Cleaning the dataset

In [369]:
# Counting the number of null values in each column

# Columns are listed explicitly so the report keeps a fixed order
columns_to_check = [
    'Country',
    'Year',
    'Agriculture',
    'Exports',
    'FertilityRate',
    'GDP',
    'Immunisation',
    'Imports',
    'Industry',
    'Inflation',
    'MerchandiseTrade',
    'MilitaryExpenditure',
    'MortalityRateU5',
    'NetMigration',
    'DevelopmentAssistanceAndAid',
    'PopulationDensity',
    'PopulationGrowth',
    'PrimarySchoolEnrollment',
    'UrbanPopulationGrowth',
    'LifeExpectancy',
]

for column in columns_to_check:
    print(f'Number of null values in {column}:', df[column].isna().sum())

In [370]:
# Removing records where values are missing

# A record is dropped when any of these indicator columns is null
required_columns = [
    'Agriculture',
    'Exports',
    'FertilityRate',
    'GDP',
    'Immunisation',
    'Imports',
    'Industry',
    'Inflation',
    'MerchandiseTrade',
    'MilitaryExpenditure',
    'MortalityRateU5',
    'NetMigration',
    'DevelopmentAssistanceAndAid',
    'PopulationDensity',
    'PopulationGrowth',
    'PrimarySchoolEnrollment',
    'UrbanPopulationGrowth',
    'LifeExpectancy',
]

clean_df = df.dropna(subset=required_columns)

In [371]:
# Printing the number of rows left after dropping records with missing values

print('Length of the dataset after cleaning: ', len(clean_df))

In [372]:
# Displaying a summary of the cleaned dataset (columns, non-null counts and dtypes)

clean_df.info()

In [373]:
# Counting the number of null values in each column after cleaning

# Columns are listed explicitly so the report keeps a fixed order
columns_to_check = [
    'Country',
    'Year',
    'Agriculture',
    'Exports',
    'FertilityRate',
    'GDP',
    'Immunisation',
    'Imports',
    'Industry',
    'Inflation',
    'MerchandiseTrade',
    'MilitaryExpenditure',
    'MortalityRateU5',
    'NetMigration',
    'DevelopmentAssistanceAndAid',
    'PopulationDensity',
    'PopulationGrowth',
    'PrimarySchoolEnrollment',
    'UrbanPopulationGrowth',
    'LifeExpectancy',
]

for column in columns_to_check:
    print(f'Number of null values in {column}:', clean_df[column].isna().sum())

In [374]:
# Displaying the dataframe after dropping records with missing values

clean_df

In [375]:
# Displaying the dimensions (rows, columns) of the dataset after cleaning

clean_df.shape

## 3. Removing outliers

In [376]:
# Printing the length of the dataset before removing outliers

print('Length of the dataset before removing outliers: ', len(clean_df))

In [377]:
# Printing the number of records in each 5 year timespan (and prior to 1980)

# Each window is [first_year, end_year): one 20-year window for 1960-1979,
# followed by consecutive 5-year windows from 1980 up to 2024
year_windows = [(1960, 1980)] + [(start, start + 5) for start in range(1980, 2025, 5)]

for first_year, end_year in year_windows:
    in_window = (clean_df.Year >= first_year) & (clean_df.Year < end_year)
    print(f'Number of records in {first_year}-{end_year - 1}:', len(clean_df[in_window]))

In [378]:
# Building a boolean mask that is True for records collected prior to 1985

before1985 = (clean_df['Year']) < 1985

In [379]:
# Removing records collected prior to 1985
# (the mask selects the index labels to drop; clean_df itself is left unchanged)

clean_df_2 = clean_df.drop(clean_df.index[before1985])

In [380]:
# Printing the length of the dataset after removing the pre-1985 records

print('Length of the dataset after removing outliers: ', len(clean_df_2))

In [381]:
# Saving the cleaned dataset as a new CSV file (the dataframe index is not written)

clean_df_2.to_csv('WorldDevelopmentIndicatorsClean.csv', index=False)

## 4. Exploring the cleaned dataset

In [382]:
# Loading the cleaned CSV; from here on `df` refers to the cleaned data, not the raw file
df = pd.read_csv("WorldDevelopmentIndicatorsClean.csv")

In [383]:
# Printing the length of the cleaned dataset

print('Length of the cleaned WorldDevelopmentIndicators dataset: ', len(df))

In [384]:
# Displaying the dataframe of the cleaned WorldDevelopmentIndicators dataset

df

In [385]:
# Displaying a summary of the cleaned dataframe (columns, non-null counts and dtypes)

df.info()

In [386]:
# Printing the number of distinct countries remaining in the cleaned dataset

print('Number of countries:', len(df['Country'].unique()))

In [387]:
# Printing the number of distinct years remaining in the cleaned dataset

print('Number of years:', len(df['Year'].unique()))

In [388]:
# Displaying the number of records for each country
# (counts non-null LifeExpectancy values per country and names the column 'Count')

df.groupby(['Country'])['LifeExpectancy'].count().reset_index(name='Count')

In [389]:
# Displaying the number of records for each country, sorted by count in ascending order

(df.groupby(['Country'])['LifeExpectancy'].count().reset_index(name='Count')).sort_values("Count")

In [390]:
# Displaying descriptive statistics for LifeExpectancy in the cleaned dataset

df['LifeExpectancy'].describe()

## 5. Plotting the dataset

In [391]:
# Finding the average value for each Indicator each year to be graphed

# The columns must be passed as a list: selecting several columns from a groupby
# with a bare tuple of names (`grouped['a', 'b']`) was deprecated and later removed
# in pandas.
indicator_columns = [
    'Agriculture',
    'Exports',
    'FertilityRate',
    'GDP',
    'Immunisation',
    'Imports',
    'Industry',
    'Inflation',
    'MerchandiseTrade',
    'MilitaryExpenditure',
    'MortalityRateU5',
    'NetMigration',
    'DevelopmentAssistanceAndAid',
    'PopulationDensity',
    'PopulationGrowth',
    'PrimarySchoolEnrollment',
    'UrbanPopulationGrowth',
    'LifeExpectancy',
]

# One row per year, one column per indicator, each cell the mean over all countries
byIndicator = df.groupby(['Year'])[indicator_columns].mean()
byIndicator.head()

In [392]:
# Plotting the average trends of each Indicator over time

# Reset matplotlib's rcParams to the library defaults before drawing
plt.rcParams.update(plt.rcParamsDefault)

# One line subplot per column of byIndicator, all sharing the x-axis
byIndicator.plot.line(subplots = True, figsize = (15, 30),sharex = True)
# NOTE(review): the style is switched after the plot call, so it presumably only affects later figures - confirm intent
plt.style.use('classic')
plt.show()

In [393]:
# Finding the average Life Expectancy for each year to be graphed
# (result has the columns 'Year' and 'MeanLifeExpectancy')

byYear = df.groupby(['Year'])['LifeExpectancy'].mean().reset_index(name='MeanLifeExpectancy')
byYear.head()

In [394]:
# Graphing mean life expectancy against year

# Reset matplotlib's rcParams to the library defaults before drawing
plt.rcParams.update(plt.rcParamsDefault)

byYear.plot.line(x = 'Year', 
                 y = 'MeanLifeExpectancy', 
                 title = 'Line Graph of Life Expectancy against Year', 
                 color = 'indigo')
# NOTE(review): the style is switched after the plot call, so it presumably only affects later figures - confirm intent
plt.style.use('classic')
plt.show()

In [395]:
# Importing sns package

import seaborn as sns

In [396]:
# Graphing life expectancy against year, with one colour per country

# Reset matplotlib's rcParams to the library defaults before drawing
plt.rcParams.update(plt.rcParamsDefault)

# Set the figure size through seaborn's rc settings
sns.set(rc={'figure.figsize':(20,30)})
sns.scatterplot(data = df, 
                x = "Year", 
                y = "LifeExpectancy", 
                hue = "Country"
               #size="size", sizes=(20, 200), hue_norm=(0, 7), legend="full"
               )
# NOTE(review): the style is switched after the plot call, so it presumably only affects later figures - confirm intent
plt.style.use('classic')
plt.title('Scatterplot of Life Expectancy by Year with Country')
plt.show()

## 6. Exploring the correlation of the dataset

In [397]:
# Displaying the head of the dataset before encoding 'Country'

df.head()

In [398]:
# Importing ordinal encoder

from sklearn.preprocessing import OrdinalEncoder

In [399]:
# Declaring the ordinal encoder - encoded values are produced as integers

enc = OrdinalEncoder(dtype = int)

In [400]:
# Converting 'Country' to numerical data - creating column 'CountryCode'
# (the encoder is fitted on the 'Country' column and the codes are stored in a new column on df)

df['CountryCode'] = enc.fit_transform(df[['Country']])

In [401]:
# Displaying the head of the dataset, now including 'CountryCode'

df.head()

In [402]:
# Displaying 'Country' next to its new code column 'CountryCode'

df[["Country", "CountryCode"]]

In [403]:
# Removing the 'Country' column which has now been replaced by 'CountryCode'
# NOTE(review): this rebinds df, so re-running this cell alone raises because 'Country' is already gone

df = df.drop(columns = ['Country'])

In [404]:
# Making 'LifeExpectancy' a categorical (discrete) variable in preparation for classification

# DataFrame.round returns a new object, so its result has to be assigned back;
# calling it on its own line changes nothing.
# Rounding to whole years and casting to int yields discrete class labels. A float
# target with decimals (even rounded to 2 places) counts as a continuous target,
# which scikit-learn classifiers such as LogisticRegression reject.
df['LifeExpectancy'] = df['LifeExpectancy'].round().astype(int)

# df1 is kept as an alias of df so any later reference to df1 still resolves
df1 = df
df1

In [405]:
# Displaying the head of the dataset used for the correlation analysis

df.head()

In [406]:
# Finding the correlations between all indicators

# Columns are listed explicitly so the matrix keeps this row/column order
correlation_columns = [
    'Year',
    'CountryCode',
    'Agriculture',
    'Exports',
    'FertilityRate',
    'GDP',
    'Immunisation',
    'Imports',
    'Industry',
    'Inflation',
    'MerchandiseTrade',
    'MilitaryExpenditure',
    'MortalityRateU5',
    'NetMigration',
    'DevelopmentAssistanceAndAid',
    'PopulationDensity',
    'PopulationGrowth',
    'PrimarySchoolEnrollment',
    'UrbanPopulationGrowth',
    'LifeExpectancy',
]

df[correlation_columns].corr()

In [407]:
# Printing the correlation between each Indicator and Life expectancy

# Driving the report from one list keeps the labels consistent and makes sure no
# indicator is left out ('MilitaryExpenditure' was previously missing).
feature_columns = [
    'Year',
    'CountryCode',
    'Agriculture',
    'Exports',
    'FertilityRate',
    'GDP',
    'Immunisation',
    'Imports',
    'Industry',
    'Inflation',
    'MerchandiseTrade',
    'MilitaryExpenditure',
    'MortalityRateU5',
    'NetMigration',
    'DevelopmentAssistanceAndAid',
    'PopulationDensity',
    'PopulationGrowth',
    'PrimarySchoolEnrollment',
    'UrbanPopulationGrowth',
]

for column in feature_columns:
    print(f'correlation between {column} and LifeExpectancy:', df[column].corr(df['LifeExpectancy']))

In [408]:
# Using an annotated heatmap to show the correlations between all columns of df

plt.figure(figsize=(15,10))
sns.heatmap(data= df.corr(), annot=True, cmap='viridis')
plt.show()

In [409]:
# Defining X (every column except the target) and y (the 'LifeExpectancy' target)
X = df.drop(['LifeExpectancy'], axis = 1)
y = df['LifeExpectancy']

In [410]:
# Correlation of each feature with the dependent variable (LifeExpectancy), shown as a bar chart

X.corrwith(df.LifeExpectancy).plot.bar(figsize = (15, 10), title = "Correlation with LifeExpectancy", fontsize = 10,grid = True)

plt.show()

In [411]:
# Custom correlation matrix (lower triangle only)

sns.set(style="white")

# Using df.corr() to set up the correlation matrix
corr = df.corr()

# Masking the upper triangle of the matrix, including the diagonal,
# so each pair of columns is drawn only once
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Setting up the figure
fig, ax = plt.subplots(figsize=(10, 10))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(500, 40, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
# (the colour scale is centred on 0 and capped at 0.3)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()

## 7. Training a Logistic Regression model to predict LifeExpectancy

In [412]:
# Importing the train_test_split package

from sklearn.model_selection import train_test_split

In [413]:
# Splitting the dataset - testing size of 20% - random_state = 42
# Features are every column except 'LifeExpectancy'; the target is 'LifeExpectancy'

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['LifeExpectancy']), df['LifeExpectancy'], test_size = 0.2, random_state = 42)

In [414]:
# Printing the shape of the training and testing sets as a sanity check on the split

print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

In [415]:
# Importing the Logistic Regression Model

from sklearn.linear_model import LogisticRegression

In [416]:
# Training the Logistic Regression Model to predict 'LifeExpectancy' based on the other features
# NOTE(review): LogisticRegression is a classifier, so y_train has to hold discrete labels - confirm the target was discretised first

clf = LogisticRegression()
clf.fit(X_train, y_train)

In [None]:
# Importing the accuracy_score package

from sklearn.metrics import accuracy_score

In [None]:
# Evaluating the accuracy of the fitted logistic regression model on the test set

y_pred = clf.predict(X_test)
print("Accuracy of logistic regression: ", accuracy_score(y_test, y_pred))

In [None]:
# Applying K-folds validation (10 folds) to the logistic regression model on the training set

from sklearn.model_selection import cross_val_score

# `clf` is the LogisticRegression estimator defined above; the previous name
# `classifier` was never defined and raised a NameError.
accuracies = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10)

# Report the mean accuracy with a +/- two standard deviation band
print('Logistic Regression Accuracy: %0.3f (+/- %0.3f)' % (accuracies.mean(), accuracies.std() * 2))

## 8. Using RFE to improve Logistic Regression model

In [None]:
# Using RFE to optimise number of features

# Fit RFE for 1..9 selected features and record the test accuracy of each fit
estimator = LogisticRegression()
feature_counts = range(1, 10)
acc_scores = []
for i in feature_counts:
    selector = RFE(estimator, n_features_to_select=i)
    selector = selector.fit(X_train, y_train)

    predicted = selector.predict(X_test)
    acc_score = accuracy_score(y_test, predicted)
    acc_scores.append(acc_score)

# The best configuration is the one with the HIGHEST accuracy (the earlier loop
# kept the lowest). argmax is 0-based, feature counts start at 1.
best = int(np.argmax(acc_scores)) + 1

plt.xlabel('# No. of features')
plt.ylabel('Accuracy score on test set')
plt.plot(feature_counts, acc_scores, marker = 'o', color = 'blueviolet', markeredgewidth = 1 ,markeredgecolor = 'royalblue', markerfacecolor = 'royalblue')
# Highlight the best-scoring number of features
plt.plot(best, acc_scores[best-1], marker = 'o', markerfacecolor = 'blueviolet')
# Enable the grid explicitly, once: calling plt.grid() with no arguments toggles
# it, so calling it twice switched the grid off again
plt.grid(True)
plt.show()

In [None]:
# Importing RFE package

from sklearn.feature_selection import RFE

# Selecting the 2 best features with RFE, using clf as the underlying estimator
rfe = RFE(clf, n_features_to_select= 2)
rfe = rfe.fit(X_train, y_train)

# Summarize the selection of the attributes:
# support_ is a boolean mask of the selected features, ranking_ gives each feature's rank
print(rfe.support_)
print(rfe.ranking_)
# Names of the selected columns
X_train.columns[rfe.support_]

In [None]:
# Summarising all features: column position, whether RFE selected it, and its rank

for i in range(X_train.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

In [None]:
# Creating a fresh RFE object around a new LogisticRegression,
# keeping 2 features and eliminating one feature per step

lr_model = LogisticRegression()
rfe = RFE(estimator=lr_model, n_features_to_select=2, step=1)
rfe.fit(X_train, y_train)

In [None]:
# Evaluating the RFE-reduced model on the test set

y_pred = rfe.predict(X_test)
print("accuracy score on test set: ", accuracy_score(y_test, y_pred)) 

## 9. Improving by inspection

In [None]:
# Splitting the dataset - testing size of 20% - random_state = 42

# Separating dataframe into data and target:
# only 'MortalityRateU5' and 'Immunisation' are kept as features

data = df[["MortalityRateU5", "Immunisation"]]
target = df['LifeExpectancy']

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 42)

In [None]:
# Printing the shape of the training and testing sets for the two-feature split

print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

In [None]:
# Training the Logistic Regression Model to predict 'LifeExpectancy' based on the two selected features

clf = LogisticRegression()
clf.fit(X_train, y_train)

In [None]:
# Evaluating the accuracy of the two-feature logistic regression model on the test set

y_pred = clf.predict(X_test)
print("Accuracy of logistic regression: ", accuracy_score(y_test, y_pred))

## 10. Training a KNN model to predict LifeExpectancy

In [None]:
# Importing the KNN model package

from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Separating dataframe into data (every column except 'LifeExpectancy') and target ('LifeExpectancy')

data = df.drop(columns = ['LifeExpectancy'])
target = df['LifeExpectancy']

In [None]:
# Splitting the dataset - testing size of 20% - random_state = 142

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state=142)

In [None]:
# Printing the shape of the training and testing sets for the KNN split
print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_test shape: {}".format(X_test.shape))
print("y_test shape: {}".format(y_test.shape))

In [None]:
# Training the KNN model - performance of model is highly dependent on k-value - a k-value of 3 has been chosen
# (k is tuned in the next section)

clf_knn = KNeighborsClassifier(n_neighbors=3)
clf_knn.fit(X_train, y_train)

In [None]:
# Evaluating the accuracy of the k=3 KNN model on the test set

y_pred = clf_knn.predict(X_test)
print("Accuracy of KNN: ", accuracy_score(y_test, y_pred))

## 11. Optimising KNN by tuning hyperparameter k

In [None]:
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt

In [None]:
# Don't need to train a model or test accuracy because cross validation does it for us

cv_scores = []
cv_scores_std = []
k_range = range(1, 150, 5) # change last number to 1 to iterate every value

# A fixed random_state makes the shuffled folds (and therefore the curve)
# reproducible, and every k is scored on the same 10 folds.
cv_splitter = KFold(n_splits=10, shuffle=True, random_state=42)

for i in k_range:
    clf = KNeighborsClassifier(n_neighbors = i)
    scores = cross_val_score(clf, data, target, scoring='accuracy', cv=cv_splitter)
    cv_scores.append(scores.mean())
    cv_scores_std.append(scores.std())

# Plot the relationship between k and the mean cross-validated accuracy
plt.errorbar(k_range, cv_scores, yerr=cv_scores_std, marker='x', label='Accuracy')
plt.xlabel('$K$')
plt.ylabel('Accuracy')
plt.xlim(0, 150)
# Single y-limit call: an earlier plt.ylim([0.1, 1.1]) was immediately overwritten by this one
plt.ylim(0, 0.2)
plt.legend(loc='best')
plt.show()

# vertical bar shows standard deviation
# we want the standard deviation to be as short as possible
# accuracy of the model can vary greatly - stableness or robustness
# need to find optimal hyper-parameter manually by observation - or use gridsearch cv

In [None]:
# Importing GridSearch

from sklearn.model_selection import GridSearchCV

In [None]:
# Using GridSearch to find the optimal value of the hyper-parameter K - Optimal value will be selected within 1 - 149

parameters = {'n_neighbors': range(1, 150)}
knn = KNeighborsClassifier()
# A fixed random_state makes the shuffled folds, and therefore the selected K, reproducible
gridSearch = GridSearchCV(knn, parameters, scoring='accuracy', cv=KFold(n_splits=10, shuffle=True, random_state=42))
gridSearch.fit(data, target)

In [None]:
# Finding the optimal K value selected by the grid search

gridSearch.best_params_

In [None]:
# Zooming in on k = 130..149; cross validation trains and scores the model for us

cv_scores = []
cv_scores_std = []
k_range = range(130, 150, 1) # every k value in the zoomed-in window

# A fixed random_state makes the shuffled folds (and therefore the curve)
# reproducible, and every k is scored on the same 10 folds.
cv_splitter = KFold(n_splits=10, shuffle=True, random_state=42)

for i in k_range:
    clf = KNeighborsClassifier(n_neighbors = i)
    scores = cross_val_score(clf, data, target, scoring='accuracy', cv=cv_splitter)
    cv_scores.append(scores.mean())
    cv_scores_std.append(scores.std())

# Plot the relationship between k and the mean cross-validated accuracy
plt.errorbar(k_range, cv_scores, yerr=cv_scores_std, marker='x', label='Accuracy')
plt.xlabel('$K$')
plt.ylabel('Accuracy')
plt.xlim(130, 150)
# Single y-limit call: an earlier plt.ylim([0.1, 1.1]) was immediately overwritten by this one
plt.ylim(0, 0.2)
plt.legend(loc='best')
plt.show()

In [None]:
# Finding the best mean cross-validated accuracy found by the grid search

gridSearch.best_score_

## 12. Gaussian Naive Bayes

In [None]:
# Importing GaussianNB and make_classification packages

from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split
clf = GaussianNB()
clf = clf.fit(X_train, y_train)

# Score the fitted model on the held-out test split
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Testing accuracy is: %.4f\n' % accuracy)

In [None]:
# Grid-searching GaussianNB's var_smoothing over 100 log-spaced values from 1 down to 1e-9
param_grid_nb = {
    'var_smoothing': np.logspace(0,-9, num=100)
}

# 10-fold cross-validation on the training split, using all available cores
nbModel_grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, verbose=1, cv=10, n_jobs=-1)
nbModel_grid.fit(X_train, y_train)
print(nbModel_grid.best_estimator_)

In [None]:
# Use 10-fold cross validation to show a more robust prediction accuracy
# (cv was previously 100, which contradicted the stated 10 folds and leaves very
# few samples per class in each fold)
clf = GaussianNB()
scores = cross_val_score(clf, X, y, scoring='accuracy', cv=10)
print('Gaussian Naive Bayes accuracy range: [%.4f, %.4f]; mean: %.4f; std: %.4f\n' % (scores.min(), scores.max(), scores.mean(), scores.std()))