In [1348]:
# This is my COMP2200 Data Science Portfolio 4 


## Data Science Portfolio Part 4

The goal of the second analysis task is to train linear regression models to predict users' ratings towards items. This involves a standard Data Science workflow: exploring data, building models, making predictions, and evaluating results. In this task, we will explore the impacts of feature selections and different sizes of training/testing data on the model performance. We will use another cleaned combined e-commerce sub-dataset that **is different from** the one in “Analysis of an E-commerce Dataset” task 1.

### Import Cleaned E-commerce Dataset
The csv file named 'cleaned_ecommerce_dataset.csv' is provided. You may need to use the Pandas method, i.e., `read_csv`, for reading it. After that, please print out its total length.

### Explore the Dataset

* Use the methods, i.e., `head()` and `info()`, to have a rough picture about the data, e.g., how many columns, and the data types of each column.
* As our goal is to predict ratings given other columns, please get the correlations between helpfulness/gender/category/review and rating by using the `corr()` method.
* To get the correlations between different features, you may need to first convert the categorical features (i.e., gender, category and review) into numerical values. To do this, you may need to import `OrdinalEncoder` from `sklearn.preprocessing` (refer to the useful examples [here](https://pbpython.com/categorical-encoding.html)).
* Please provide ___necessary explanations/analysis___ on the correlations, and figure out which are the ___most___ and ___least___ correlated features regarding rating. Try to ___discuss___ how the correlation will affect the final prediction results, if we use these features to train a regression model for rating prediction. In what follows, we will conduct experiments to verify your hypothesis.

In [1349]:
# Import Library   

import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import r2_score

import matplotlib.pylab as plt
%matplotlib inline


# Import ignore warnings 

import warnings
warnings.filterwarnings('ignore')

In [1350]:
# Load the World Development Indicators CSV from the working directory into `df`;
# the exploration and cleaning cells below read the raw data through this name

df = pd.read_csv("WorldDevelopmentIndicators.csv")

## 1. Exploring the dataset

In [1351]:
# Printing the number of rows in the raw dataset, before any cleaning is applied

print('Length of the dataset before cleaning: ', len(df))

In [1352]:
# Previewing the first few rows of the raw dataset with head()

df.head()

In [1353]:
# Displaying the raw dataframe as loaded, before any cleaning

df

In [1354]:
# Displaying a summary of the raw dataframe (columns, non-null counts, dtypes)

df.info()

In [1355]:
# Counting the distinct values in the Country column of the raw dataset.
# dropna=False keeps a missing value as its own entry, as unique() does.

raw_country_count = df['Country'].nunique(dropna=False)
print('Number of countries:', raw_country_count)

In [1356]:
# Counting the distinct values in the Year column of the raw dataset.
# dropna=False keeps a missing value as its own entry, as unique() does.

raw_year_count = df['Year'].nunique(dropna=False)
print('Number of years:', raw_year_count)

In [1357]:
# Displaying the dimensions (rows, columns) of the dataset before cleaning

df.shape

In [1358]:
# Displaying descriptive statistics for the LifeExpectancy column of the raw dataset

df['LifeExpectancy'].describe()

In [1359]:
# Showing every record whose life expectancy equals the lowest value seen
# anywhere in the raw dataset (all years, all countries)

LifeExpectancyMin = df['LifeExpectancy'].min()
is_overall_lowest = df['LifeExpectancy'].eq(LifeExpectancyMin)
lowest_records = df.loc[is_overall_lowest, ['Country', 'Year', 'LifeExpectancy']]
print(lowest_records)

In [1360]:
# Showing every record whose life expectancy equals the highest value seen
# anywhere in the raw dataset (all years, all countries)

LifeExpectancyMax = df['LifeExpectancy'].max()
is_overall_highest = df['LifeExpectancy'].eq(LifeExpectancyMax)
highest_records = df.loc[is_overall_highest, ['Country', 'Year', 'LifeExpectancy']]
print(highest_records)

In [1361]:
# Displaying the country with the lowest life expectancy in 2020
#
# The lookup is done on the 2020 subset itself. Matching the 2020 minimum
# against the whole dataframe would also return records from other years
# that happen to share the same life expectancy value.

Data2020 = df.loc[df['Year'] == 2020]
LifeExpectancyMin2020 = Data2020['LifeExpectancy'].min()
print(Data2020.loc[Data2020['LifeExpectancy'] == LifeExpectancyMin2020, ['Country', 'Year', 'LifeExpectancy']])

In [1362]:
# Displaying the country with the highest life expectancy in 2020
#
# The lookup is done on the 2020 subset itself. Matching the 2020 maximum
# against the whole dataframe would also return records from other years
# that happen to share the same life expectancy value.

Data2020 = df.loc[df['Year'] == 2020]
LifeExpectancyMax2020 = Data2020['LifeExpectancy'].max()
print(Data2020.loc[Data2020['LifeExpectancy'] == LifeExpectancyMax2020, ['Country', 'Year', 'LifeExpectancy']])

## 2. Cleaning the dataset

In [1363]:
# Counting the number of null values in each column of the raw dataset.
# The columns are listed explicitly so the report keeps a fixed order.

raw_null_report_columns = [
    'Country',
    'Year',
    'Agriculture',
    'Exports',
    'FertilityRate',
    'GDP',
    'Immunisation',
    'Imports',
    'Industry',
    'Inflation',
    'MerchandiseTrade',
    'MilitaryExpenditure',
    'MortalityRateU5',
    'NetMigration',
    'DevelopmentAssistanceAndAid',
    'PopulationDensity',
    'PopulationGrowth',
    'PrimarySchoolEnrollment',
    'UrbanPopulationGrowth',
    'LifeExpectancy',
]

for column_name in raw_null_report_columns:
    print(f'Number of null values in {column_name}:', df[column_name].isna().sum())

In [1364]:
# Keeping only the records that have a value for every indicator column.
# Country and Year are not part of the missing-value check.

required_indicator_columns = [
    'Agriculture',
    'Exports',
    'FertilityRate',
    'GDP',
    'Immunisation',
    'Imports',
    'Industry',
    'Inflation',
    'MerchandiseTrade',
    'MilitaryExpenditure',
    'MortalityRateU5',
    'NetMigration',
    'DevelopmentAssistanceAndAid',
    'PopulationDensity',
    'PopulationGrowth',
    'PrimarySchoolEnrollment',
    'UrbanPopulationGrowth',
    'LifeExpectancy',
]
clean_df = df.dropna(subset=required_indicator_columns)

In [1365]:
# Printing the number of rows left once records with missing values are dropped

print('Length of the dataset after cleaning: ', len(clean_df))

In [1366]:
# Displaying a summary of the cleaned dataset (columns, non-null counts, dtypes)

clean_df.info()

In [1424]:
# Counting the number of null values in each column after cleaning.
# The columns are listed explicitly so the report keeps a fixed order.

clean_null_report_columns = [
    'Country',
    'Year',
    'Agriculture',
    'Exports',
    'FertilityRate',
    'GDP',
    'Immunisation',
    'Imports',
    'Industry',
    'Inflation',
    'MerchandiseTrade',
    'MilitaryExpenditure',
    'MortalityRateU5',
    'NetMigration',
    'DevelopmentAssistanceAndAid',
    'PopulationDensity',
    'PopulationGrowth',
    'PrimarySchoolEnrollment',
    'UrbanPopulationGrowth',
    'LifeExpectancy',
]

for column_name in clean_null_report_columns:
    print(f'Number of null values in {column_name}:', clean_df[column_name].isna().sum())

In [1368]:
# Displaying the dataframe after records with missing values were dropped

clean_df

In [1369]:
# Displaying the dimensions (rows, columns) of the dataset after cleaning

clean_df.shape

## 3. Removing outliers

In [1370]:
# Printing the length of the dataset before removing outliers

print('Length of the dataset before removing outliers: ', len(clean_df))

In [1371]:
# Printing the number of records in each 5 year timespan (and prior to 1980).
# Each pair is (first year of the bin, first year of the next bin); the upper
# bound is exclusive, so the printed label ends one year before it.

year_bins = [
    (1960, 1980),
    (1980, 1985),
    (1985, 1990),
    (1990, 1995),
    (1995, 2000),
    (2000, 2005),
    (2005, 2010),
    (2010, 2015),
    (2015, 2020),
    (2020, 2025),
]

for bin_start, next_bin_start in year_bins:
    in_bin = (clean_df.Year >= bin_start) & (clean_df.Year < next_bin_start)
    print(f'Number of records in {bin_start}-{next_bin_start - 1}:', len(clean_df[in_bin]))

In [1372]:
# Flagging the records collected prior to 1985 (boolean mask aligned with clean_df)

before1985 = clean_df['Year'].lt(1985)

In [1373]:
# Keeping every record that is not flagged as collected prior to 1985

clean_df_2 = clean_df.loc[~before1985]

In [1374]:
# Printing the number of rows left after the pre-1985 records were removed

print('Length of the dataset after removing outliers: ', len(clean_df_2))

In [1375]:
# Saving the cleaned dataset as a new CSV file; index=False leaves the row index out of the file

clean_df_2.to_csv('WorldDevelopmentIndicatorsClean.csv', index=False)

## 4. Exploring the cleaned dataset

In [1376]:
# Reloading the cleaned dataset saved above. This rebinds `df`: from this cell
# on, `df` refers to the cleaned data rather than the raw file.
df = pd.read_csv("WorldDevelopmentIndicatorsClean.csv")

In [1377]:
# Printing the length of the cleaned dataset that was just reloaded

print('Length of the cleaned WorldDevelopmentIndicators dataset: ', len(df))

In [1378]:
# Displaying the cleaned WorldDevelopmentIndicators dataframe

df

In [1379]:
# Displaying a summary of the cleaned dataframe (columns, non-null counts, dtypes)

df.info()

In [1380]:
# Counting the distinct values in the Country column of the cleaned dataset.
# dropna=False keeps a missing value as its own entry, as unique() does.

cleaned_country_count = df['Country'].nunique(dropna=False)
print('Number of countries:', cleaned_country_count)

In [1381]:
# Counting the distinct values in the Year column of the cleaned dataset.
# dropna=False keeps a missing value as its own entry, as unique() does.

cleaned_year_count = df['Year'].nunique(dropna=False)
print('Number of years:', cleaned_year_count)

In [1382]:
# Displaying how many LifeExpectancy records each country has

country_record_counts = df.groupby('Country')['LifeExpectancy'].count()
country_record_counts.rename('Count').reset_index()

In [1383]:
# Displaying how many LifeExpectancy records each country has, fewest first

country_record_counts_sorted = df.groupby('Country')['LifeExpectancy'].count()
country_record_counts_sorted.rename('Count').reset_index().sort_values("Count")

In [1384]:
# Displaying descriptive statistics for the LifeExpectancy column of the cleaned dataset

df['LifeExpectancy'].describe()

## 5. Plotting the dataset

In [1385]:
# Finding the average value of each indicator for each year, to be graphed.
#
# The columns are selected with a list. Indexing a GroupBy with a bare tuple
# of column names is deprecated and raises an error in pandas 2.0 and later.

yearly_mean_columns = [
    'Agriculture',
    'Exports',
    'FertilityRate',
    'GDP',
    'Immunisation',
    'Imports',
    'Industry',
    'Inflation',
    'MerchandiseTrade',
    'MilitaryExpenditure',
    'MortalityRateU5',
    'NetMigration',
    'DevelopmentAssistanceAndAid',
    'PopulationDensity',
    'PopulationGrowth',
    'PrimarySchoolEnrollment',
    'UrbanPopulationGrowth',
    'LifeExpectancy',
]

byIndicator = df.groupby('Year')[yearly_mean_columns].mean()
byIndicator.head()

In [1386]:
# Plotting the average trend of each indicator over time, one subplot per indicator

# Reset matplotlib's settings to their defaults before drawing
plt.rcParams.update(plt.rcParamsDefault)

byIndicator.plot.line(subplots = True, figsize = (15, 30),sharex = True)
# NOTE(review): the 'classic' style is selected only after the plot call above,
# so it presumably does not style this figure - confirm whether it should come first
plt.style.use('classic')
plt.show()

In [1387]:
# Finding the average life expectancy for each year, to be graphed

byYear = df.groupby(['Year'])['LifeExpectancy'].mean().reset_index(name='MeanLifeExpectancy')
byYear.head()

In [1388]:
# Graphing the yearly mean life expectancy against year

# Reset matplotlib's settings to their defaults before drawing
plt.rcParams.update(plt.rcParamsDefault)

byYear.plot.line(x = 'Year', 
                 y = 'MeanLifeExpectancy', 
                 title = 'Line Graph of Life Expectancy against Year', 
                 color = 'indigo')
# NOTE(review): the 'classic' style is selected only after the plot call above,
# so it presumably does not style this figure - confirm whether it should come first
plt.style.use('classic')
plt.show()

In [1389]:
# Importing sns package

import seaborn as sns

In [1390]:
# Graphing life expectancy against year, one colour per country

# Reset matplotlib's settings to their defaults before drawing
plt.rcParams.update(plt.rcParamsDefault)

# The figure size is set through seaborn's global settings
sns.set(rc={'figure.figsize':(20,30)})
sns.scatterplot(data = df, 
                x = "Year", 
                y = "LifeExpectancy", 
                hue = "Country"
               )
plt.style.use('classic')
plt.title('Scatterplot of Life Expectancy by Year with Country')
plt.show()

## 6. Exploring the correlation of the dataset

In [1391]:
# Displaying the head of the dataset before 'Country' is encoded

df.head()

In [1392]:
# Importing ordinal encoder

from sklearn.preprocessing import OrdinalEncoder

In [1393]:
# Declaring the ordinal encoder; dtype=int makes the encoded codes integers

enc = OrdinalEncoder(dtype = int)

In [1394]:
# Converting 'Country' to numerical data: the encoder is fitted on the
# 'Country' column and its codes are stored in a new column 'CountryCode'

df['CountryCode'] = enc.fit_transform(df[['Country']])

In [1395]:
# Displaying the head of the dataset with the new 'CountryCode' column

df.head()

In [1396]:
# Displaying 'Country' next to its code column 'CountryCode'

df[["Country", "CountryCode"]]

In [1397]:
# Dropping the text column 'Country'; its information is kept in 'CountryCode'

df = df.drop(columns = ['Country'])

In [1398]:
# Displaying the head of the dataset after 'Country' was dropped

df.head()

In [1399]:
# Finding the pairwise correlations between Year, CountryCode, every
# indicator and LifeExpectancy

correlation_columns = [
    'Year',
    'CountryCode',
    'Agriculture',
    'Exports',
    'FertilityRate',
    'GDP',
    'Immunisation',
    'Imports',
    'Industry',
    'Inflation',
    'MerchandiseTrade',
    'MilitaryExpenditure',
    'MortalityRateU5',
    'NetMigration',
    'DevelopmentAssistanceAndAid',
    'PopulationDensity',
    'PopulationGrowth',
    'PrimarySchoolEnrollment',
    'UrbanPopulationGrowth',
    'LifeExpectancy',
]

df.loc[:, correlation_columns].corr()

In [1400]:
# Printing the correlation between each feature and LifeExpectancy.
#
# Looping over one list keeps the labels and the columns in step. The list
# includes MilitaryExpenditure, which the earlier hand-written report left
# out, and the Exports label no longer carries a typo in 'LifeExpectancy'.

correlation_features = [
    'Year',
    'CountryCode',
    'Agriculture',
    'Exports',
    'FertilityRate',
    'GDP',
    'Immunisation',
    'Imports',
    'Industry',
    'Inflation',
    'MerchandiseTrade',
    'MilitaryExpenditure',
    'MortalityRateU5',
    'NetMigration',
    'DevelopmentAssistanceAndAid',
    'PopulationDensity',
    'PopulationGrowth',
    'PrimarySchoolEnrollment',
    'UrbanPopulationGrowth',
]

for feature in correlation_features:
    print(f'correlation between {feature} and LifeExpectancy:', df[feature].corr(df['LifeExpectancy']))

In [1401]:
# Drawing the full correlation matrix as an annotated heatmap

full_corr_matrix = df.corr()

plt.figure(figsize=(15,10))
sns.heatmap(full_corr_matrix, annot=True, cmap='viridis')
plt.show()

In [1402]:
# Separating the features (X: every column except LifeExpectancy)
# from the target (y: LifeExpectancy)
y = df['LifeExpectancy']
X = df.drop(columns=['LifeExpectancy'])

In [1403]:
# Bar chart of each feature's correlation with the target (dependent) variable, LifeExpectancy

X.corrwith(df.LifeExpectancy).plot.bar(figsize = (15, 10), title = "Correlation with LifeExpectancy", fontsize = 10,grid = True)

plt.show()

In [1404]:
# Custom correlation matrix: lower triangle only, diverging colours

sns.set(style="white")

# Using df.corr() to set up the correlation matrix
corr = df.corr()

# Covering the upper triangle (including the diagonal), which only mirrors
# the lower triangle
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Setting up the figure
fig, ax = plt.subplots(figsize=(10, 10))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(500, 40, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio.
# The colour scale spans the full correlation range [-1, 1]; a smaller vmax
# would clip every stronger correlation to the same colour. The heatmap is
# drawn on the axes created above rather than on the implicit current axes.
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
plt.show()

## 7. Training logistic regression models

In [1405]:
# Changing LifeExpectancy to whole-number classes so a logistic regression
# model can be built.
#
# df1 is an independent copy. A plain `df1 = df` only adds a second name for
# the same dataframe, so the integer cast would silently overwrite the
# original LifeExpectancy values in df as well.
# astype('int') drops the fractional part of each value.

df1 = df.copy()
df1['LifeExpectancy'] = df1['LifeExpectancy'].astype('int')

In [1406]:
# Displaying the head of df1 to check the integer LifeExpectancy values
df1.head()

In [1407]:
# Importing train test split package

from sklearn.model_selection import train_test_split

In [1408]:
# Splitting the dataset: 20% of the rows are held out for testing, and
# random_state = 42 fixes the shuffle so the split is repeatable

train, test = train_test_split(df1, test_size = 0.2, random_state = 42)
print(train.shape)
print(test.shape)

In [1409]:
# Separating the model inputs (every column except LifeExpectancy) from the
# target (LifeExpectancy) for both the training and the testing split

y_train = train['LifeExpectancy']
y_test = test['LifeExpectancy']
X_train = train.drop(columns=['LifeExpectancy'])
X_test = test.drop(columns=['LifeExpectancy'])

# Report the shape of each piece in the same order as before
for piece_name, piece in (("X_train", X_train),
                          ("y_train", y_train),
                          ("X_test", X_test),
                          ("y_test", y_test)):
    print(f"{piece_name} shape: ", piece.shape)

In [1410]:
# Importing the Logistic Regression Model

from sklearn.linear_model import LogisticRegression

In [1411]:
# Training a Logistic Regression model with default settings on all features
# NOTE(review): warnings are silenced at the top of the notebook, so a
# convergence warning from this fit would not be shown - confirm the solver converges

clf = LogisticRegression()
clf.fit(X_train, y_train)

In [1412]:
# Predicting the LifeExpectancy class for both the training and the test set

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

In [1413]:
# Importing accuracy score and confusion matrix packages

from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix, ConfusionMatrixDisplay, accuracy_score 

In [1414]:
# Evaluating the trained model: accuracy on the data it was fitted on
# versus accuracy on the held-out test data

print("Accuracy score on training set: ", accuracy_score(y_train, y_pred_train))
print("Accuracy score on testing set: ", accuracy_score(y_test, y_pred_test))

# TODO: compare the two scores above and discuss possible overfitting

In [1415]:
# Applying 10-fold cross-validation to the logistic regression model on the training set

from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator= clf, X=X_train, y=y_train, cv=10)

# Report the mean fold accuracy with a +/- band of two standard deviations.
# The mean and std are computed only here; standalone calls in the middle of
# a cell are neither stored nor displayed.
print('Logistic Regression Accuracy: %0.3f (+/- %0.3f)' % (accuracies.mean(), accuracies.std() * 2))

In [1416]:
#Recursive feature elimination
from sklearn.feature_selection import RFE

In [1417]:
# Recursive feature elimination with the logistic regression model.
# n_features_to_select is left as None, so RFE's default count is used.

rfe = RFE(clf, n_features_to_select=None)
rfe.fit(X_train, y_train)

# Summary of the selection: boolean mask of kept features, then their ranking
print(rfe.support_)

print(rfe.ranking_)

# Names of the features that were kept
X_train.columns[rfe.support_]

In [1418]:
# Custom correlation matrix for the features selected by RFE

sns.set(style="white")

# Correlations between the RFE-selected training features only
corr = X_train[X_train.columns[rfe.support_]].corr()

# Covering the upper triangle (including the diagonal), which only mirrors
# the lower triangle
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Setting up the figure
fig, ax = plt.subplots(figsize=(10, 10))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(500, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio.
# The colour scale spans the full correlation range [-1, 1]; a smaller vmax
# would clip every stronger correlation to the same colour. The heatmap is
# drawn on the axes created above rather than on the implicit current axes.
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
plt.show()

In [1419]:
# Training a second logistic regression model on the RFE-selected features
# only, then scoring it on the test set
rfe_selected_columns = X_train.columns[rfe.support_]

classifier = LogisticRegression(random_state=0)
classifier.fit(X_train[rfe_selected_columns], y_train)

# Accuracy of the reduced-feature model on the test set
y_pred_test_2 = classifier.predict(X_test[rfe_selected_columns])
accuracy_score(y_test, y_pred_test_2)



In [None]:
# Checking the confusion matrix on the test set (rows = truth, cols = prediction).
# The matrix is built from the test labels and test predictions so that it
# matches the printed heading.

print("Confusion matrix on test set: ")
print(confusion_matrix(y_test, y_pred_test))

In [1421]:
# Summarising the test-set performance of the logistic regression model in a
# one-row results table
from sklearn.metrics import accuracy_score, f1_score,recall_score,precision_score, confusion_matrix

# Predicting the test set
y_pred = clf.predict(X_test)

# The target has many classes, so precision, recall and F1 are averaged over
# the classes, weighted by how often each class occurs. This gives one number
# per metric; average=None would put a whole per-class array in each cell.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# clf is a default LogisticRegression (L2 penalty), so it is not labelled as Lasso
results = pd.DataFrame([['Logistic Regression', acc, prec, rec, f1]],
                       columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
results

In [1422]:
# Confusion matrix of the test-set predictions made by clf
confusion_matrix(y_test, y_pred_test) # rows = truth, cols = prediction

In [1423]:
# Confusion matrix heatmap for the test-set predictions
#
# The target has one class per whole-number life expectancy, so the matrix is
# larger than 2x2. The row/column labels are taken from the classes that occur
# in the true or predicted test labels; hard-coding (0, 1) only fits a binary
# problem and fails with a shape error here.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred_test)]))
cm = confusion_matrix(y_test, y_pred_test, labels=class_labels) # rows = truth, cols = prediction
df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)

plt.figure(figsize = (10,7))
sns.set(font_scale=1.4)
sns.heatmap(df_cm, annot=True, fmt='g')
plt.xlabel('Predicted LifeExpectancy')
plt.ylabel('True LifeExpectancy')

# Accuracy is computed from the same predictions that the matrix shows
print("Test Data Accuracy: %0.4f" % accuracy_score(y_test, y_pred_test))
plt.show()

## 8. Using RFE to improve logistic regression model

In [None]:
# Creating an RFE object around a fresh logistic regression model; it removes
# one feature per step (step=1) until 5 features remain, then fitting it on
# the training data. This rebinds `rfe`, replacing the earlier selection.

lr_model = LogisticRegression()
rfe = RFE(estimator=lr_model, n_features_to_select=5, step=1)
rfe.fit(X_train, y_train)

In [None]:
# Training the Logistic Regression model to predict 'LifeExpectancy' based on other features
# NOTE(review): this fit uses every column of X_train and does not use the RFE
# selection fitted in this section - confirm whether it was meant to train on
# the selected features only

clf = LogisticRegression()
clf.fit(X_train, y_train)