# Machine Learning: Classification - Managing the Quality Metric of Global Ecological Footprint

# Stage-C-Lesson-1

In [2]:
# Import important libraries
import pandas as pd
import numpy as np

In [3]:
# Url of the dataset
url = 'https://query.data.world/s/wh6j7rxy2hvrn4ml75ci62apk5hgae'

# read the dataset 
df = pd.read_csv(url, low_memory=False)
# then save the df to the local machine for offline use
df.to_csv('ecological_footprint_data.csv', index=False)

In [3]:
# Load the local csv file
data = pd.read_csv('ecological_footprint_data.csv', low_memory=False)
# Explore the first five rows
data.head()

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Armenia,1992,1,AreaPerCap,0.140292,0.199546,0.097188051,0.036888,0.02932,0.0,0.5032351,3A
1,Armenia,1992,1,AreaTotHA,483000.0,687000.0,334600.0,127000.0,100943.0008,0.0,1732543.0,3A
2,Armenia,1992,1,BiocapPerCap,0.159804,0.135261,0.084003213,0.013742,0.033398,0.0,0.4262086,3A
3,Armenia,1992,1,BiocapTotGHA,550176.2427,465677.9722,289207.1078,47311.55172,114982.2793,0.0,1467355.0,3A
4,Armenia,1992,1,EFConsPerCap,0.38751,0.189462,1.26e-06,0.004165,0.033398,1.114093,1.728629,3A


# Linear Classification and Logistic Regression

In [4]:
# Check the distribution of target variable (QScore)
data['QScore'].value_counts()

3A    51481
2A    10576
2B    10096
1B       16
1A       16
Name: QScore, dtype: int64

In [5]:
# Check missing values of each column
data.isna().sum()

country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64

In [6]:
# For simplicity, we will drop the rows with missing values
data = data.dropna()
# Check again missing values of columns
data.isna().sum()

country           0
year              0
country_code      0
record            0
crop_land         0
grazing_land      0
forest_land       0
fishing_ground    0
built_up_land     0
carbon            0
total             0
QScore            0
dtype: int64

An obvious change in our target variable after removing the missing values is that there are only three classes left and from the distribution of the 3 classes, we can see that there is an obvious imbalance between the classes. There are methods that can be applied to handle this imbalance such as oversampling and undersampling.

Oversampling involves increasing the number of instances in the class with fewer instances while undersampling involves reducing the data points in the class with more instances.


In [7]:
data['QScore'].value_counts()

3A    51473
2A      224
1A       16
Name: QScore, dtype: int64

For now, we will convert this to a binary classification problem by combining class '2A' and '1A'.

In [8]:
data['QScore'] = data['QScore'].replace(['1A'], ['2A'])
data['QScore'].value_counts()

3A    51473
2A      240
Name: QScore, dtype: int64

In [9]:
df_2A = data[data.QScore == '2A']
df_3A = data[data.QScore == '3A'].sample(350)

data_df = df_2A.append(df_3A)
data_df.head()

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
1536,Algeria,2016,4,AreaPerCap,0.2072989,0.8112722,0.048357265,0.022585,0.02998367,0.0,1.119497,2A
1537,Algeria,2016,4,AreaTotHA,8417600.0,32942600.0,1963600.0,917100.0,1217520.0,0.0,45458420.0,2A
1538,Algeria,2016,4,BiocapPerCap,0.2021916,0.2636077,0.027166736,0.007948,0.02924496,0.0,0.530159,2A
1539,Algeria,2016,4,BiocapTotGHA,8210214.0,10704080.0,1103135.245,322736.9162,1187524.0,0.0,21527690.0,2A
1540,Algeria,2016,4,EFConsPerCap,0.6280528,0.1810332,0.162800822,0.014729,0.02924496,1.391455,2.407316,2A


In [10]:
import sklearn.utils
data_df = sklearn.utils.shuffle(data_df, random_state=0)
data_df = data_df.reset_index(drop=True)
data_df.shape
data_df['QScore'].value_counts()

3A    350
2A    240
Name: QScore, dtype: int64

# Preprocessing the data and separate the target variable from feature variables

In [11]:
data_df.head()

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Trinidad and Tobago,2016,220,AreaTotHA,47000.0,7000.0,236100.0,2113600.0,1291.78,0.0,2404992.0,2A
1,Belize,2016,23,EFProdPerCap,0.5071023,0.2055006,0.20984546,2.222697,0.000554051,0.4805086,3.626208,2A
2,Guatemala,2016,89,EFConsTotGHA,7289958.0,2665859.0,9354954.762,589403.7,1496289.0,9760726.0,31157190.0,2A
3,Libyan Arab Jamahiriya,2006,124,EFConsPerCap,0.6178304,0.3286547,0.116582309,0.04430471,0.01642221,2.200892,3.324687,3A
4,Kyrgyzstan,2016,113,EFConsPerCap,0.4271284,0.1977978,0.059306959,0.005479879,0.07480447,0.8907022,1.65522,2A


In [12]:
data_df = data_df.drop(columns=['country', 'year', 'country_code'])
len(data_df)

590

In [13]:
X = data_df.drop(columns='QScore')
y = data_df.QScore


In [14]:
# Split the data into training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


In [15]:
# Count the y_train
y_train.value_counts()

3A    248
2A    165
Name: QScore, dtype: int64

There is still an imbalance in the class distribution. For this, we use SMOTE 
only on the training data to handle this.

In other way, there are categorical variables available, so that, we use LabelEncoder to encode categorical variables.

Scikit-learn has a LabelEncoder class that can be used to get label encodings. We loop over the categorical variables and apply the label encoder separately to each column if we have more than one. But now we have only one categorical variable to encode is 'record'

In [16]:
# First, we should have to get list of categorical variables automatically to encode if any
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

X_train['forest_land'].dtype

dtype('O')

In [17]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
# Encode categorical variable and transform it
X_train.record = encoder.fit_transform(X_train.record)
X_test.record = encoder.transform(X_test.record)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [18]:
import imblearn
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1)
X_train_balanced, y_train_balanced = smote.fit_sample(X_train, y_train)


In [19]:
# Scale using Min - Max scaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

normalised_train_df = scaler.fit_transform(X_train_balanced.drop(columns=['record']))
normalised_train_df = pd.DataFrame(normalised_train_df, columns=X_train_balanced.drop(columns=['record']).columns)
normalised_train_df['record'] = X_train_balanced['record']


In [20]:
# normalize the x_test using min-max scaler

X_test = X_test.reset_index(drop=True)
normalised_test_df = scaler.transform(X_test.drop(columns=['record']))
normalised_test_df = pd.DataFrame(normalised_test_df, columns=X_test.drop(columns=['record']).columns)
normalised_test_df['record'] = X_test['record']

In [21]:
# Create LogisticRegression Model
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
# Fit and train the model
log_reg.fit(normalised_train_df, y_train_balanced)

# Check
#log_reg.__dict__


LogisticRegression()

# Stage-C-Lesson 2
# Measuring Classification Performance

Cross Validation techniques to evaluate the performance of the model

In [22]:
# Performing cross validation
# Import libararies
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_score
scores = cross_val_score(log_reg, normalised_train_df, y_train_balanced, cv=5, scoring='f1_macro')

# printing scores
print(scores)

# Print average score
print('Average score', round(scores.mean()*100, 3))


[0.4887218  0.48091696 0.39583583 0.56494635 0.53530612]
Average score 49.315


Confusion Matrix, Precision-Recall, ROC curve and the F1-score

In [23]:
# Confusiin Matrix
new_predictions = log_reg.predict(normalised_test_df)
cnf_mat = confusion_matrix(y_true=y_test, y_pred=new_predictions, labels=['2A', '3A'])
cnf_mat

array([[41, 34],
       [57, 45]])

In [24]:
# Accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=new_predictions)
print('Accuracy: {}'.format(round(accuracy*100), 2))

Accuracy: 49.0


In [29]:
# Precision
precision = precision_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print('Precision: {}'.format(round(precision*100), 2)) #prints 41.0
precision

Precision: 42.0


0.41836734693877553

In [30]:
# Recall
recall = recall_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print('Recall: {}'.format(round(recall*100), 2)) #prints 51.0 

Recall: 55.0


In [31]:
# F1-Score
f1 = f1_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print('F1: {}'.format(round(f1*100), 2)) #prints 45.0


F1: 47.0


In [32]:
# K-Fold Cross Validation
from sklearn.model_selection import KFold
kf = KFold(n_splits=5) # the training data is split into 5 equal groups
kf.split(normalised_train_df)

f1_scores = []
# run for every split
for train_index, test_index in kf.split(normalised_train_df):
    x_train, x_test = normalised_train_df.iloc[train_index], normalised_train_df.iloc[test_index]
    y_train, y_test = y_train_balanced.iloc[train_index], y_train_balanced.iloc[test_index]
    
    model = LogisticRegression().fit(x_train, y_train)
    # Append result to the list
    f1_scores.append(f1_score(y_true = y_test, y_pred=model.predict(x_test), pos_label='2A')*100)

# print scores
print(f1_scores)

# Print average score
print('Average score', round(np.array(f1_scores).mean(), 3))
    

[58.39416058394161, 66.66666666666667, 52.23880597014925, 51.162790697674424, 0.0]
Average score 45.692


In [33]:
# Using StratifiedKFold Cross Validation
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

f1_scores = []

# run for every split
for train_index, test_index in skf.split(normalised_train_df, y_train_balanced):
    x_train, x_test = np.array(normalised_train_df)[train_index], np.array(normalised_train_df)[test_index]
    y_train, y_test = y_train_balanced[train_index], y_train_balanced[test_index]
    
    model = LogisticRegression().fit(x_train, y_train)
    # Save the result to list
    f1_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), pos_label='2A')*100)

# print f1_scores
print(f1_scores)

# Print average score
print('Average score', round(np.array(f1_scores).mean(), 3))
    

[51.85185185185185, 60.0, 50.505050505050505, 50.943396226415096, 58.2089552238806]
Average score 54.302


In [34]:
# Leave One Out cross validation(LOOCV)
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(LogisticRegression(), normalised_train_df, y_train_balanced, cv=loo, scoring='f1_macro')

# Print average score
print('Average score', round(scores.mean()*100, 3))

Average score 44.355


# Stage-C-Lesson 4
Tree-Based Methods and The Support Vector Machine

In [35]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dec_tree = DecisionTreeClassifier()
# Fit the model
dec_tree.fit(normalised_train_df, y_train_balanced)

DecisionTreeClassifier()