In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('NFA 2019 public_data.csv');

  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
df.head(5)

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Armenia,1992,1,AreaPerCap,0.140292,0.199546,0.097188051,0.036888,0.02932,0.0,0.5032351,3A
1,Armenia,1992,1,AreaTotHA,483000.0,687000.0,334600.0,127000.0,100943.0008,0.0,1732543.0,3A
2,Armenia,1992,1,BiocapPerCap,0.159804,0.135261,0.084003213,0.013742,0.033398,0.0,0.4262086,3A
3,Armenia,1992,1,BiocapTotGHA,550176.2427,465677.9722,289207.1078,47311.55172,114982.2793,0.0,1467355.0,3A
4,Armenia,1992,1,EFConsPerCap,0.38751,0.189462,1.26e-06,0.004165,0.033398,1.114093,1.728629,3A


In [3]:
#df.columns

#we check the distribution of target variable
df['QScore'].value_counts()

3A    51481
2A    10576
2B    10096
1A       16
1B       16
Name: QScore, dtype: int64

In [4]:
#check the number of missing values
df.isna().sum()

country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64

In [5]:
#for simplicity, we will drop the rows with missing values
df = df.dropna()
df.isna().sum() #and display

country           0
year              0
country_code      0
record            0
crop_land         0
grazing_land      0
forest_land       0
fishing_ground    0
built_up_land     0
carbon            0
total             0
QScore            0
dtype: int64

In [6]:
df["QScore"].value_counts()

3A    51473
2A      224
1A       16
Name: QScore, dtype: int64

In [7]:
#An obvious change in our target variable after removing the missing values is that there are only three classes left
#and from the distribution of the 3 classes, we can see that there is an obvious imbalance between the classes.
#There are methods that can be applied to handle this imbalance such as oversampling and undersampling.
#Oversampling involves increasing the data points in the class with fewer instances.
#while undersampling involves reducing the data points in the class with more instances
#For now, we will convert this to a binary classification problem by combining class "2A" and "1A"

df["QScore"] = df["QScore"].replace(['1A'], '2A')
df.QScore.value_counts()

3A    51473
2A      240
Name: QScore, dtype: int64

In [8]:
df_2A = df[df.QScore=="2A"]
df_3A = df[df.QScore=="3A"].sample(350)
data_df = df_2A.append(df_3A)

In [9]:
import sklearn.utils
data_df = sklearn.utils.shuffle(data_df)
data_df = data_df.reset_index(drop=True)
data_df.shape
data_df.QScore.value_counts()

3A    350
2A    240
Name: QScore, dtype: int64

In [10]:
#more processing
data_df = data_df.drop(columns=['country_code', 'country', 'year'])
x = data_df.drop(columns='QScore')
y = data_df['QScore']

#split the data into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
y_train.value_counts()

3A    246
2A    167
Name: QScore, dtype: int64

In [11]:
#There is still an imbalance in the class distribution. For this, we use SMOTE only on the training data to handle this

#encode categorical variable
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
x_train.record = encoder.fit_transform(x_train.record)
x_test.record = encoder.transform(x_test.record)

In [12]:
import imblearn
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1)
x_train_balanced, y_balanced = smote.fit_sample(x_train, y_train)

In [13]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normalised_train_df = scaler.fit_transform(x_train_balanced.drop(columns=['record']))
normalised_train_df = pd.DataFrame(normalised_train_df, columns=x_train_balanced.drop(columns=['record']).columns)
normalised_train_df['record'] = x_train_balanced['record']


In [14]:
x_test = x_test.reset_index(drop=True)
normalised_test_df = scaler.transform(x_test.drop(columns=['record']))
normalised_test_df = pd.DataFrame(normalised_test_df, columns=x_test.drop(columns=['record']).columns)
normalised_test_df['record'] = x_test['record']

LOGISTIC REGRESSION

In [15]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(normalised_train_df, y_balanced)

#returns=
'''
LogisticRegression(C= 1.0 , class_weight= None , dual= False , fit_intercept= True ,
 intercept_scaling= 1 , l1_ratio= None , max_iter= 100 ,
 multi_class= 'auto' , n_jobs= None , penalty= 'l2' ,
 random_state= None , solver= 'lbfgs' , tol= 0.0001 , verbose= 0 ,
 warm_start= False ) 
'''

"\nLogisticRegression(C= 1.0 , class_weight= None , dual= False , fit_intercept= True ,\n intercept_scaling= 1 , l1_ratio= None , max_iter= 100 ,\n multi_class= 'auto' , n_jobs= None , penalty= 'l2' ,\n random_state= None , solver= 'lbfgs' , tol= 0.0001 , verbose= 0 ,\n warm_start= False ) \n"

MEASURING CLASSIFICATION PERFORMANCE

In [16]:
#Cross-validation and accuracy
from sklearn.model_selection import cross_val_score
scores = cross_val_score(log_reg, normalised_train_df, y_balanced, cv=5, scoring='f1_macro')
scores

array([0.51391162, 0.49661017, 0.61763155, 0.51278409, 0.44690635])

In [17]:
#Confusion Matrix
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix
new_predictions = log_reg.predict(normalised_test_df)
cnf_mat = confusion_matrix(y_true=y_test, y_pred=new_predictions, labels=[ '2A' , '3A' ])
cnf_mat

array([[39, 34],
       [54, 50]], dtype=int64)

In [18]:
#Accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=new_predictions)
print( 'Accuracy: {}' .format(round(accuracy* 100 ), 2 ))

Accuracy: 50


In [19]:
#precision
precision = precision_score(y_true=y_test, y_pred=new_predictions, pos_label= '2A' )
print( 'Precision: {}' .format(round(precision* 100 ), 2 )) 

Precision: 42


In [20]:
#Recall
recall = recall_score(y_true=y_test, y_pred=new_predictions, pos_label= '2A' )
print( 'Recall: {}' .format(round(recall* 100 ), 2 ))

Recall: 53


In [21]:
#F1-score
f1 = f1_score(y_true=y_test, y_pred=new_predictions, pos_label= '2A' )
print( 'F1: {}' .format(round(f1* 100 ), 2 )) 

F1: 47


In [22]:
from sklearn.metrics import f1_score


#K-Fold Cross Validation
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
kf.split(normalised_train_df)
f1_scores = []

#run for every split
for train_index, test_index in kf.split(normalised_train_df):
    x_train, x_test = normalised_train_df.iloc[train_index], normalised_train_df.iloc[test_index]
    y_train, y_test = y_balanced[train_index], y_balanced[test_index]
    model = LogisticRegression().fit(x_train, y_train)
    
#save result to list
f1_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), pos_label='2A')*100)

In [23]:
# Stratified k-Fold Cross Validation
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
f1_scores = []

# run for every split
for train_index, test_index in skf.split(normalised_train_df, y_balanced):
    x_train, x_test = np.array(normalised_train_df)[train_index], np.array(normalised_train_df)[test_index]
    y_train, y_test = y_balanced[train_index], y_balanced[test_index]
    
    model = LogisticRegression().fit(x_train, y_train)
# save result to list
f1_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), pos_label='2A'))


In [24]:
#Leave One Out Cross Validation (LOOCV)
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(LogisticRegression(), normalised_train_df, y_balanced, cv=loo, scoring='f1_macro')
average_score = scores.mean()*100