In [2]:
import numpy as np
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt
#pickle and joblib are used to save your trained model
import pickle
#from sklearn.externals import joblib
import joblib
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [3]:
data=pd.read_csv("Right-To-Work-Law.csv")
data

Unnamed: 0,City,COL,PD,URate,Pop,Taxes,Income,RTWL
0,Atlanta,169,414,13.6,1790128,5128,2961,1
1,Austin,143,239,11.0,396891,4303,1711,1
2,Bakersfield,339,43,23.7,349874,4166,2122,0
3,Baltimore,173,951,21.0,2147850,5001,4654,0
4,Baton Rouge,99,255,16.0,411725,3965,1620,1
5,Boston,363,1257,24.4,3914071,4928,5634,0
6,Buffalo,253,834,39.2,1326848,4471,7213,0
7,Champaign-Urbana,117,162,31.5,162304,4813,5535,0
8,Cedar Rapids,294,229,18.2,164145,4839,7224,1
9,Chicago,291,1886,31.5,7015251,5408,6113,0


In [4]:
data.isnull().sum()

City      0
COL       0
PD        0
URate     0
Pop       0
Taxes     0
Income    0
RTWL      0
dtype: int64

# Numerical features

In [5]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!='object']

print("Number of Numeric Variables ",len(numerical_features))
data[numerical_features]

Number of Numeric Variables  7


Unnamed: 0,COL,PD,URate,Pop,Taxes,Income,RTWL
0,169,414,13.6,1790128,5128,2961,1
1,143,239,11.0,396891,4303,1711,1
2,339,43,23.7,349874,4166,2122,0
3,173,951,21.0,2147850,5001,4654,0
4,99,255,16.0,411725,3965,1620,1
5,363,1257,24.4,3914071,4928,5634,0
6,253,834,39.2,1326848,4471,7213,0
7,117,162,31.5,162304,4813,5535,0
8,294,229,18.2,164145,4839,7224,1
9,291,1886,31.5,7015251,5408,6113,0


# Removing Outlier

In [7]:
dataset=data.copy()
for feature in numerical_features:
    
    if 0 in dataset[feature].unique():
        pass
    else:
        
        
        q1=dataset[feature].quantile(0.25)
        q3=dataset[feature].quantile(0.75)
        iqr=q3-q1
        lower_lim=q1-1.5*iqr
        upper_lim=q3+1.5*iqr
        
        dataset=dataset[(dataset[feature]<=upper_lim) & (dataset[feature]>=lower_lim)]
        
       

#  Categorical to Numerical Conversion

In [25]:

label_encoder=LabelEncoder()
dataset.City=label_encoder.fit_transform(dataset.City)
dataset

Unnamed: 0,City,COL,PD,URate,Pop,Taxes,Income,RTWL
0,0,169,414,13.6,1790128,5128,2961,1
1,1,143,239,11.0,396891,4303,1711,1
2,2,339,43,23.7,349874,4166,2122,0
3,3,173,951,21.0,2147850,5001,4654,0
4,4,99,255,16.0,411725,3965,1620,1
5,5,363,1257,24.4,3914071,4928,5634,0
6,6,253,834,39.2,1326848,4471,7213,0
7,8,117,162,31.5,162304,4813,5535,0
8,7,294,229,18.2,164145,4839,7224,1
10,9,170,643,29.5,1381196,4637,4806,0


# Handling Ibalancing (oversampled Technique)

In [26]:
new_data=dataset.copy()

In [27]:

item_1=new_data[new_data["RTWL"]==1]
item_0=new_data[new_data["RTWL"]==0]


item_0.shape





(23, 8)

In [28]:
item_1.shape

(10, 8)

In [29]:
# 0 has more counts

#maximise 1 values and make it equal to the 0 
#So we pass 23 to data_over 1 as total data_over_0 are 23

data_over_1=item_1.sample(23,replace=True)
data_over=pd.concat([data_over_1,item_0],axis=0)

data_over.RTWL.value_counts()
#Hence data is already Sampled almost 



1    23
0    23
Name: RTWL, dtype: int64

In [31]:
len(data_over)

46

In [32]:

x=data_over.drop(['RTWL'],axis=1)
y=data_over["RTWL"]

# Splitting Data

In [63]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=15,stratify=y)

In [64]:
from sklearn.ensemble import GradientBoostingClassifier

In [65]:
model=GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=2, random_state=1)
model.fit(x_train, y_train)
model.score(x_test, y_test)


0.9166666666666666

In [66]:
pred=model.predict(x_test)

In [67]:
y_test

4     1
10    0
0     1
37    1
36    1
2     0
13    0
25    1
14    0
1     1
19    0
32    0
Name: RTWL, dtype: int64

# Classification Report

In [68]:

from sklearn.metrics import classification_report
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       1.00      0.83      0.91         6
           1       0.86      1.00      0.92         6

    accuracy                           0.92        12
   macro avg       0.93      0.92      0.92        12
weighted avg       0.93      0.92      0.92        12

