In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Download link for the dataset CSV hosted on data.world.
url = 'https://query.data.world/s/wh6j7rxy2hvrn4ml75ci62apk5hgae'
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column is never typed inconsistently.
# NOTE(review): the warning tail printed under this cell looks like a mixed-type
# DtypeWarning (forest_land is read as object) — confirm it disappears.
df = pd.read_csv(url, low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Armenia,1992,1,AreaPerCap,0.140292,0.199546,0.097188051,0.036888,0.02932,0.0,0.5032351,3A
1,Armenia,1992,1,AreaTotHA,483000.0,687000.0,334600.0,127000.0,100943.0008,0.0,1732543.0,3A
2,Armenia,1992,1,BiocapPerCap,0.159804,0.135261,0.084003213,0.013742,0.033398,0.0,0.4262086,3A
3,Armenia,1992,1,BiocapTotGHA,550176.2427,465677.9722,289207.1078,47311.55172,114982.2793,0.0,1467355.0,3A
4,Armenia,1992,1,EFConsPerCap,0.38751,0.189462,1.26e-06,0.004165,0.033398,1.114093,1.728629,3A


In [None]:
# (number of rows, number of columns) of the raw frame
df.shape

(72186, 12)

In [None]:
# Names of all columns in the raw frame
df.columns

Index(['country', 'year', 'country_code', 'record', 'crop_land',
       'grazing_land', 'forest_land', 'fishing_ground', 'built_up_land',
       'carbon', 'total', 'QScore'],
      dtype='object')

In [None]:
# Per-column dtype and non-null count, used to spot missing values and odd dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72186 entries, 0 to 72185
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   country         72186 non-null  object 
 1   year            72186 non-null  int64  
 2   country_code    72186 non-null  int64  
 3   record          72186 non-null  object 
 4   crop_land       51714 non-null  float64
 5   grazing_land    51714 non-null  float64
 6   forest_land     51714 non-null  object 
 7   fishing_ground  51713 non-null  float64
 8   built_up_land   51713 non-null  float64
 9   carbon          51713 non-null  float64
 10  total           72177 non-null  float64
 11  QScore          72185 non-null  object 
dtypes: float64(6), int64(2), object(4)
memory usage: 6.6+ MB


#To check Distribution of Target variable



In [None]:
# Number of rows per QScore class in the raw data
df.QScore.value_counts()

3A    51481
2A    10576
2B    10096
1A       16
1B       16
Name: QScore, dtype: int64

In [None]:
# Missing values per column (isna is the same check as isnull)
df.isna().sum()

country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64

#for simplicity, we will drop the rows with missing values.

In [None]:
# Discard every row that has at least one missing value
df = df.dropna(how='any')

In [None]:
# Sanity check after dropna(): every column should now report zero missing values
df.isna().sum()

country           0
year              0
country_code      0
record            0
crop_land         0
grazing_land      0
forest_land       0
fishing_ground    0
built_up_land     0
carbon            0
total             0
QScore            0
dtype: int64

#An obvious change in our target variable after removing the missing values is that there are only three classes left 

In [None]:
# Class distribution of the target once rows with missing values are gone
df.QScore.value_counts()

3A    51473
2A      224
1A       16
Name: QScore, dtype: int64

#and from the distribution of the 3 classes, we can see that there is an obvious imbalance between the classes.

#There are methods that can be applied to handle this imbalance such as oversampling and undersampling.

#Oversampling involves increasing the number of instances in the class with fewer instances while

#undersampling involves reducing the data points in the class with more instances.


#For now, we will convert this to a binary classification problem by combining class '2A' and '1A'.

In [None]:
# Fold the rare '1A' class into '2A' so the target becomes binary.
# This only relabels existing rows; no rows are added or removed.
df['QScore'] = df['QScore'].replace({'1A': '2A'})

In [None]:
# Class distribution after merging '1A' into '2A'
df['QScore'].value_counts()

3A    51473
2A      240
Name: QScore, dtype: int64

In [None]:
# Keep every row of the minority class ('2A').
df_2A = df[df.QScore == '2A']
# Undersample the majority class ('3A') down to 350 rows; the fixed seed makes
# the draw repeatable across kernel restarts.
df_3A = df[df.QScore == '3A'].sample(n=350, random_state=0)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat performs the same row-wise stacking and keeps the original index labels.
data_df = pd.concat([df_2A, df_3A])

In [None]:
# First five rows of the combined frame (head() defaults to n=5)
data_df.head()

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
1536,Algeria,2016,4,AreaPerCap,0.2072989,0.8112722,0.048357265,0.022585,0.02998367,0.0,1.119497,2A
1537,Algeria,2016,4,AreaTotHA,8417600.0,32942600.0,1963600.0,917100.0,1217520.0,0.0,45458420.0,2A
1538,Algeria,2016,4,BiocapPerCap,0.2021916,0.2636077,0.027166736,0.007948,0.02924496,0.0,0.530159,2A
1539,Algeria,2016,4,BiocapTotGHA,8210214.0,10704080.0,1103135.245,322736.9162,1187524.0,0.0,21527690.0,2A
1540,Algeria,2016,4,EFConsPerCap,0.6280528,0.1810332,0.162800822,0.014729,0.02924496,1.391455,2.407316,2A


In [None]:
# Last five rows of the combined frame (tail() defaults to n=5)
data_df.tail()

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
37779,Madagascar,1990,129,BiocapTotGHA,4048800.0,26021600.0,19395296.36,4809580.0,572399.2292,0.0,54847680.0,3A
43024,Moldova,2007,146,AreaPerCap,0.5146961,0.08719557,0.090150536,0.02325215,0.029992,0.0,0.7452865,3A
25970,Guinea,2005,90,BiocapPerCap,0.3704488,0.8973333,0.795496622,0.3634982,0.038674,0.0,2.465451,3A
13689,Congo,2014,46,AreaTotHA,627000.0,10000000.0,22349400.0,834900.0,158557.9987,0.0,33969860.0,3A
14601,Cuba,2015,49,AreaTotHA,3405987.0,2834313.0,3200000.0,6074800.0,342278.0151,0.0,15857380.0,3A


In [None]:
import sklearn.utils
#used to shuffle data

In [None]:
# Shuffle the rows so the 2A and 3A samples are randomly mixed instead of stacked
# one class after the other; the fixed seed keeps the order reproducible on re-run.
data_df = sklearn.utils.shuffle(data_df, random_state=0)


In [None]:
# Swap the shuffled row labels for a clean 0, 1, 2, ... index; the old labels are discarded
data_df = data_df.reset_index(drop=True)
data_df.head(n=3)


Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Iraq,2016,103,EFProdTotGHA,4605966.0,626123.2886,86606.15132,46690.87196,1046806.0,49244120.0,55656310.0,2A
1,Tajikistan,2016,208,EFProdPerCap,0.282995,0.128678,0.003938652,0.0002,0.09446197,0.2141098,0.7243837,2A
2,Singapore,1975,200,EFConsPerCap,0.3892345,0.097126,0.362438654,0.280494,0.01397892,0.4795226,1.622795,3A


In [None]:
data_df.shape
# The dataset is much smaller now because class 3A was undersampled

(590, 12)

In [None]:
# Class distribution of the undersampled, shuffled frame
data_df['QScore'].value_counts()

3A    350
2A    240
Name: QScore, dtype: int64

In [None]:
# Remove the identifier columns, which are not used as model inputs
data_df = data_df.drop(['country_code', 'country', 'year'], axis=1)


In [None]:
# QScore is the label to predict; every remaining column is a model input
target = data_df['QScore']
features = data_df.drop('QScore', axis=1)

#split the data into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Class distribution of the training labels only
y_train.value_counts()

3A    244
2A    169
Name: QScore, dtype: int64

#There is still an imbalance in the class distribution. We handle it with SMOTE, applied only to the training data so that the test set stays untouched.




#Encode categorical variables as numerical variables
Label encoding: convert categorical data into numbers, because machine learning algorithms can only work with numerical input.

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Label-encode the categorical 'record' column: learn the mapping from the
# training split only, then reuse that same mapping on the test split.
encoder = LabelEncoder()
# x_train / x_test are derived from `features` by train_test_split; work on explicit
# copies so the column assignments below do not raise SettingWithCopyWarning.
x_train = x_train.copy()
x_test = x_test.copy()
x_train['record'] = encoder.fit_transform(x_train['record'])
# NOTE: transform() raises ValueError if the test split holds a label not seen in training.
x_test['record'] = encoder.transform(x_test['record'])

#Balance the classes using SMOTE, applied only to the training data.

In [None]:
import imblearn
from imblearn.over_sampling import SMOTE


In [None]:
# 'record' holds label-encoded categories. Plain SMOTE builds synthetic rows by
# interpolating between neighbours, which would create fractional, meaningless
# category codes. SMOTENC interpolates only the continuous columns and gives each
# synthetic row a category taken from its nearest neighbours.
record_position = x_train.columns.get_loc('record')
smote = imblearn.over_sampling.SMOTENC(
    categorical_features=[record_position],
    random_state=1,
)
# Oversample the minority class in the training split only; the test split is left alone.
x_train_balanced, y_balanced = smote.fit_resample(x_train, y_train)

Scaling brings features whose values lie on very different scales into a comparable range.

MinMaxScaler transforms the data by rescaling each feature to a given range.

In [None]:
from sklearn.preprocessing import MinMaxScaler
# One shared scaler instance: fitted on the training features, then reused for the test features
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the numeric training columns; the label-encoded 'record'
# column is kept out of the scaling and re-attached unchanged afterwards.
train_numeric = x_train_balanced.drop(columns=['record'])
normalised_train_df = pd.DataFrame(
    scaler.fit_transform(train_numeric),
    columns=train_numeric.columns,
)
normalised_train_df['record'] = x_train_balanced['record']

In [None]:
# Give the test rows a 0..n-1 index so 'record' lines up with the rebuilt frame below
x_test = x_test.reset_index(drop=True)
# Apply the scaler already fitted on the training data (transform only, no refit)
test_numeric = x_test.drop(columns=['record'])
normalised_test_df = pd.DataFrame(
    scaler.transform(test_numeric),
    columns=test_numeric.columns,
)
normalised_test_df['record'] = x_test['record']

#Logistic Regression


In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
# Train a logistic regression with default settings on the balanced, scaled training data
log_reg = LogisticRegression()
log_reg.fit(X=normalised_train_df, y=y_balanced)


LogisticRegression()

In [None]:

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

LogisticRegression()