<a href="https://colab.research.google.com/github/ProsperChuks/logistic-regression/blob/main/log_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [30]:
# Read the CSV straight from its data.world query URL into a DataFrame,
# then preview the first rows as the cell's output.
DATA_URL = 'https://query.data.world/s/wh6j7rxy2hvrn4ml75ci62apk5hgae'
dataf = pd.read_csv(DATA_URL)
dataf.head()

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Armenia,1992,1,AreaPerCap,0.140292,0.199546,0.097188051,0.036888,0.02932,0.0,0.5032351,3A
1,Armenia,1992,1,AreaTotHA,483000.0,687000.0,334600.0,127000.0,100943.0008,0.0,1732543.0,3A
2,Armenia,1992,1,BiocapPerCap,0.159804,0.135261,0.084003213,0.013742,0.033398,0.0,0.4262086,3A
3,Armenia,1992,1,BiocapTotGHA,550176.2427,465677.9722,289207.1078,47311.55172,114982.2793,0.0,1467355.0,3A
4,Armenia,1992,1,EFConsPerCap,0.38751,0.189462,1.26e-06,0.004165,0.033398,1.114093,1.728629,3A


Preparing the Data

In [31]:
def _show(summary):
    """Print `summary` followed by the blank-line separator used between reports."""
    print(summary)
    print('\n')


# Before cleaning: class distribution of the target and missing values per column.
_show(dataf.QScore.value_counts())
_show(dataf.isna().sum())

# Discard every row that contains at least one missing value.
dataf = dataf.dropna()

# After cleaning: confirm no nulls remain and see how the class counts changed.
_show(dataf.isna().sum())
print(dataf.QScore.value_counts())

3A    51481
2A    10576
2B    10096
1B       16
1A       16
Name: QScore, dtype: int64


country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64


country           0
year              0
country_code      0
record            0
crop_land         0
grazing_land      0
forest_land       0
fishing_ground    0
built_up_land     0
carbon            0
total             0
QScore            0
dtype: int64


3A    51473
2A      224
1A       16
Name: QScore, dtype: int64


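Merging the rare `1A` class into `2A` turns this into a binary classification problem (`2A` vs `3A`). The majority class `3A` is then randomly undersampled to reduce the imbalance between the classes; oversampling (SMOTE) is applied later, to the training split only.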

In [32]:
import sklearn.utils

# Fold the rare '1A' label into '2A' so the target has only two classes.
dataf['QScore'] = dataf['QScore'].replace(['1A'], '2A')

# Keep every '2A' row and randomly undersample '3A' down to 350 rows.
# A fixed random_state makes the subsample reproducible across runs.
df_2A = dataf[dataf.QScore == '2A']
df_3A = dataf[dataf.QScore == '3A'].sample(350, random_state=0)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# way to stack the two frames row-wise.
class_df = pd.concat([df_2A, df_3A])

# Shuffle so the two classes are interleaved, then renumber the rows 0..n-1.
class_df = sklearn.utils.shuffle(class_df, random_state=0)
class_df = class_df.reset_index(drop=True)
class_df.QScore.value_counts()

3A    350
2A    240
Name: QScore, dtype: int64

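Preparing the dataset for training: drop the columns that are not used as features, separate the features from the target (`QScore`), and split the data into training and test sets.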

In [33]:
from sklearn.model_selection import train_test_split

# Remove the country / year columns so they are not used as model inputs.
class_df = class_df.drop(['country_code', 'country', 'year'], axis=1)

# Features are every remaining column except the target; the target is QScore.
y = class_df['QScore']
X = class_df.drop('QScore', axis=1)

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
y_train.value_counts()

3A    249
2A    164
Name: QScore, dtype: int64

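Encoding the categorical `record` column, oversampling the training set with SMOTE, and normalizing the numeric features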

In [46]:
import warnings
# NOTE(review): this silences every warning for the rest of the session. It is
# kept because later cells may rely on the quiet output, but consider narrowing
# it to specific warning categories.
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder

# Work on copies so the column assignments below do not write into frames
# derived from X (avoids pandas' SettingWithCopyWarning).
x_train = x_train.copy()
x_test = x_test.copy()

# Learn the label -> integer mapping from the training data only and reuse that
# same mapping for the test data. Refitting on the test set could assign
# different codes to the same label. transform() raises ValueError if the test
# set contains a label that was not seen during fit.
encoder = LabelEncoder()
x_train['record'] = encoder.fit_transform(x_train['record'])
x_test['record'] = encoder.transform(x_test['record'])

import imblearn
from imblearn.over_sampling import SMOTE
# Oversample the minority class in the training split only.
# fit_sample was removed from imbalanced-learn; fit_resample is the supported name.
# NOTE(review): SMOTE interpolates all columns, including the integer-coded
# 'record' column -- consider SMOTENC for the categorical feature.
smote = SMOTE(random_state=1)
x_train_balanced, y_balanced = smote.fit_resample(x_train, y_train)

x_train_balanced = pd.DataFrame(x_train_balanced, columns=x_train.columns)
from sklearn.preprocessing import MinMaxScaler

# Every column except the encoded 'record' column is min-max scaled.
numeric_cols = x_train_balanced.columns.drop('record')

# Fit the scaler on the (balanced) training features only.
scaler = MinMaxScaler()
normalized_train = scaler.fit_transform(x_train_balanced[numeric_cols])
# Pass the Index itself (not wrapped in a list) so the columns stay a flat
# Index instead of becoming a one-level MultiIndex.
normalized_train = pd.DataFrame(normalized_train, columns=numeric_cols)
normalized_train['record'] = x_train_balanced['record']

# Scale the test features with the min/max learned from the training data;
# refitting here would scale train and test inconsistently.
x_test = x_test.reset_index(drop=True)
normalized_test = scaler.transform(x_test[numeric_cols])
normalized_test = pd.DataFrame(normalized_test, columns=numeric_cols)
normalized_test['record'] = x_test['record']