In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [164]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [165]:
# Read the placement dataset from a CSV file given by a relative path.
df = pd.read_csv('placement.csv')

In [166]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,cgpa,placement_exam_marks,placed
0,7.19,26.0,1
1,7.46,38.0,1
2,7.54,40.0,1
3,6.42,8.0,1
4,7.23,17.0,0


In [167]:
# Summarise the columns, their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   cgpa                  1000 non-null   float64
 1   placement_exam_marks  1000 non-null   float64
 2   placed                1000 non-null   int64  
dtypes: float64(2), int64(1)
memory usage: 23.6 KB


# Baseline prediction (no outlier handling)

In [168]:
# Features are every column except the last one; the target is the last column.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=42,
)

In [169]:
# Standardise the two numeric feature columns.
tnf = ColumnTransformer(
    transformers=[
        ('tnf1', StandardScaler(), ['cgpa', 'placement_exam_marks']),
    ],
)

In [170]:
# Fit the scaler on the training features only, then apply the same
# fitted transformation to the test features.
trfd_xtrain = tnf.fit_transform(X=x_train)
trfd_xtest = tnf.transform(X=x_test)

In [171]:
# Fit a logistic regression and a decision tree on the scaled training
# features and print each model's accuracy on the held-out test set.

clf = LogisticRegression()
# Seed the tree: scikit-learn permutes the features at every split, so an
# unseeded tree may produce a different score on each run.
clf2 = DecisionTreeClassifier(random_state=42)

clf.fit(trfd_xtrain, y_train)
clf2.fit(trfd_xtrain, y_train)

pred = clf.predict(trfd_xtest)
pred2 = clf2.predict(trfd_xtest)

# accuracy_score takes the true labels first, then the predictions.
print(accuracy_score(y_test, pred))
print(accuracy_score(y_test, pred2))

0.5466666666666666
0.5533333333333333


In [172]:
# Mean 10-fold cross-validation score on the scaled training data,
# printed for the logistic regression first and the decision tree second.
for model in (clf, clf2):
    fold_scores = cross_val_score(estimator=model, X=trfd_xtrain, y=y_train, cv=10)
    print(np.mean(fold_scores))

0.5235294117647058
0.47411764705882353


# Dealing with outliers

In [54]:
# Features are every column except the last one; the target is the last column.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

# Same 85/15 split and seed as the baseline section.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=42,
)

In [55]:
# Standardise the training cgpa values with StandardScaler
# (the scaler needs 2-D input, hence the one-column frame).
sclr = StandardScaler()
new_df = sclr.fit_transform(x_train[['cgpa']])


In [56]:
# Manual z-score: (value - mean) / std, using statistics of the training cgpa.
cgpa_train = x_train['cgpa']
mean, std = cgpa_train.mean(), cgpa_train.std()

new_xtrain = (cgpa_train - mean) / std

In [66]:
# Training rows whose cgpa z-score is above +3 (upper tail only).
new_xtrain.loc[new_xtrain.gt(3)]

996    3.552555
Name: cgpa, dtype: float64

In [62]:
# Score the test cgpa values against the TRAINING mean and std,
# so the test rows do not influence the statistics.
cgpa_train = x_train['cgpa']
mean, std = cgpa_train.mean(), cgpa_train.std()

new_xtest = x_test['cgpa'].sub(mean).div(std)

In [67]:
# Test rows whose cgpa z-score is above +3 (upper tail only).
new_xtest.loc[new_xtest.gt(3)]

995    3.142761
Name: cgpa, dtype: float64

When looking for outliers in the test set, apply the mean and standard deviation computed on the training set -- this is the correct way to find them.

In [69]:
# Store the cgpa z-score as a new column, using the mean and std of the
# whole dataset.
# NOTE: this mutates df in place, so any later cell that selects columns of
# df by position will see the extra cgpa_zscore column.
cgpa = df['cgpa']
df['cgpa_zscore'] = (cgpa - cgpa.mean()) / cgpa.std()

In [75]:
# Rows whose cgpa z-score is below -3 (lower tail).
df.loc[df['cgpa_zscore'] < -3]

Unnamed: 0,cgpa,placement_exam_marks,placed,cgpa_zscore
485,4.92,44.0,1,-3.314251
997,4.89,34.0,0,-3.36296
999,4.9,10.0,1,-3.346724


# Trimming 

In [173]:
# Select features and target by column name rather than by position.
# An earlier cell appends a cgpa_zscore column to df; with positional
# slicing that column would become the target and 'placed' would leak
# into the features when the notebook is run top to bottom.
x = df[['cgpa', 'placement_exam_marks']]
y = df['placed']

# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)

In [175]:
# Outlier bounds: training mean of cgpa plus/minus three standard deviations.
cgpa_mean = x_train['cgpa'].mean()
cgpa_std = x_train['cgpa'].std()
maxi = cgpa_mean + 3 * cgpa_std
mini = cgpa_mean - 3 * cgpa_std

# Keep only the training rows strictly inside the bounds.
mask1 = x_train['cgpa'] < maxi
mask2 = x_train['cgpa'] > mini
clean_xtrain = x_train[mask1 & mask2]

In [176]:
# Apply the same inside-the-bounds filter to the training labels so they
# stay aligned with the trimmed training features.
keep_train = x_train['cgpa'].lt(maxi) & x_train['cgpa'].gt(mini)
clean_ytrain = y_train[keep_train]

In [177]:
# Display the trimmed training features.
clean_xtrain

Unnamed: 0,cgpa,placement_exam_marks
643,7.59,28.0
158,5.74,50.0
977,7.18,49.0
429,7.35,55.0
941,7.18,8.0
...,...,...
106,6.59,22.0
270,6.80,16.0
860,7.51,21.0
435,7.45,37.0


In [178]:
# Filter the test rows with the bounds derived from the training set.
# mask1/mask2 are rebound here to the test-set masks.
mask1, mask2 = x_test['cgpa'] < maxi, x_test['cgpa'] > mini

clean_xtest = x_test.loc[mask1 & mask2]

In [179]:
# Keep the test labels for the rows that passed both masks.
clean_ytest = y_test.loc[mask1 & mask2]

In [180]:
# Display the trimmed test labels.
clean_ytest

521    1
737    1
740    1
660    1
411    0
      ..
914    0
810    1
244    0
822    1
321    0
Name: placed, Length: 149, dtype: int64

In [181]:
# Standardise the two numeric feature columns.
tnf = ColumnTransformer(
    transformers=[
        ('tnf1', StandardScaler(), ['cgpa', 'placement_exam_marks']),
    ],
)

In [182]:
# Fit the scaler on the trimmed training features, then apply the same
# fitted transformation to the trimmed test features.
trfd_xtrain = tnf.fit_transform(X=clean_xtrain)
trfd_xtest = tnf.transform(X=clean_xtest)

In [183]:
# Fit a logistic regression and a decision tree on the trimmed, scaled
# training data and print each model's accuracy on the trimmed test set.

clf = LogisticRegression()
# Seed the tree: scikit-learn permutes the features at every split, so an
# unseeded tree may produce a different score on each run.
clf2 = DecisionTreeClassifier(random_state=42)

clf.fit(trfd_xtrain, clean_ytrain)
clf2.fit(trfd_xtrain, clean_ytrain)

pred = clf.predict(trfd_xtest)
pred2 = clf2.predict(trfd_xtest)

# accuracy_score takes the true labels first, then the predictions.
print(accuracy_score(clean_ytest, pred))
print(accuracy_score(clean_ytest, pred2))

0.5369127516778524
0.5436241610738255


# Capping

In [216]:
# Select features and target by column name rather than by position.
# An earlier cell appends a cgpa_zscore column to df; with positional
# slicing that column would become the target and 'placed' would leak
# into the features when the notebook is run top to bottom.
x = df[['cgpa', 'placement_exam_marks']]
y = df['placed']

# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)

In [217]:
# Outlier bounds: training mean of cgpa plus/minus three standard deviations.
cgpa_mean = x_train['cgpa'].mean()
cgpa_std = x_train['cgpa'].std()
maxi = cgpa_mean + 3 * cgpa_std
mini = cgpa_mean - 3 * cgpa_std

# Training rows that fall outside the bounds on either side.
is_outlier = (x_train['cgpa'] > maxi) | (x_train['cgpa'] < mini)
x_train1 = x_train[is_outlier]

In [223]:
# Cap cgpa at the upper bound. The assignment is restricted to the cgpa
# column: assigning through a bare boolean row mask would overwrite every
# column of the matching rows (placement_exam_marks included) with maxi.
x_train.loc[x_train['cgpa'] > maxi, 'cgpa'] = maxi

In [225]:
# Raise cgpa to the lower bound. The assignment is restricted to the cgpa
# column: assigning through a bare boolean row mask would overwrite every
# column of the matching rows (placement_exam_marks included) with mini.
x_train.loc[x_train['cgpa'] < mini, 'cgpa'] = mini

In [219]:
# Test rows whose cgpa falls outside the training-derived bounds.
cgpa_test = x_test['cgpa']
x_test1 = x_test[(cgpa_test > maxi) | (cgpa_test < mini)]

In [227]:
# Cap test cgpa at the training-derived upper bound. The assignment is
# restricted to the cgpa column: a bare boolean row mask would overwrite
# every column of the matching rows (placement_exam_marks included).
x_test.loc[x_test['cgpa'] > maxi, 'cgpa'] = maxi

In [229]:
# Raise test cgpa to the training-derived lower bound. The assignment is
# restricted to the cgpa column: a bare boolean row mask would overwrite
# every column of the matching rows (placement_exam_marks included).
x_test.loc[x_test['cgpa'] < mini, 'cgpa'] = mini

In [231]:
# Standardise the two numeric feature columns; any other column is passed
# through unchanged.
tnf = ColumnTransformer(
    transformers=[
        ('trf1', StandardScaler(), ['cgpa', 'placement_exam_marks']),
    ],
    remainder='passthrough',
)

In [232]:
# Fit the scaler on the capped training features, then apply the same
# fitted transformation to the capped test features.
trfd_xtrain = tnf.fit_transform(X=x_train)
trfd_xtest = tnf.transform(X=x_test)

In [234]:
# Fit a logistic regression and a decision tree on the capped, scaled
# training data and print each model's accuracy on the test set.

clf = LogisticRegression()
# Seed the tree: scikit-learn permutes the features at every split, so an
# unseeded tree may produce a different score on each run.
clf2 = DecisionTreeClassifier(random_state=42)

clf.fit(trfd_xtrain, y_train)
clf2.fit(trfd_xtrain, y_train)

pred = clf.predict(trfd_xtest)
pred2 = clf2.predict(trfd_xtest)

# accuracy_score takes the true labels first, then the predictions.
print(accuracy_score(y_test, pred))
print(accuracy_score(y_test, pred2))

0.54
0.54
