In [1]:
import pandas as pd
import numpy as np 
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, confusion_matrix
from pandas_profiling import ProfileReport

In [2]:
# Load the red-wine dataset; the `quality` column is used as the target below.
df=pd.read_csv("winequality-red.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
#ProfileReport(df)

In [4]:
# Separate the target (`quality`) from the feature matrix (every other column).
y=df["quality"]
X=df.drop(columns=["quality"])

In [5]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=45,
)

### KNN

In [6]:
from sklearn.neighbors import KNeighborsClassifier

In [7]:
# Baseline model: KNN with scikit-learn's default hyperparameters.
# NOTE(review): the features are passed in unscaled, so wide-range columns such as
# "total sulfur dioxide" will dominate the distance computation — consider
# standardizing the inputs before fitting.
knn=KNeighborsClassifier()  #by default the n_neighbors is 5.
knn.fit(X_train,y_train)

In [8]:
knn.score(X_test,y_test)  #Test-set accuracy of the default KNN model: 0.54

0.54

Use GridSearchCV on top of the KNN model to find the hyperparameters that give the best cross-validated accuracy.

In [9]:
# Hyperparameter grid for KNN: 49 * 2 * 4 * 9 * 2 = 7056 candidate combinations.
# NOTE(review): per the scikit-learn docs, `algorithm` and `leaf_size` control the
# speed/memory of the neighbor search rather than which neighbors are returned, so
# searching over them multiplies the run time by 36 — confirm and consider dropping them.
param={
    "n_neighbors":range(1,50,1),
    "weights":['uniform', 'distance'],
    "algorithm":["auto","ball_tree","kd_tree","brute"],
    "leaf_size":[10,15,20,25,30,35,40,45,50],
    "p":[1,2]
}

In [10]:
# Exhaustive cross-validated search over `param`, parallelized across all cores (n_jobs=-1).
# `cv` and `scoring` are left at their library defaults.
grid_cv=GridSearchCV(knn, param_grid=param, n_jobs=-1)

In [11]:
# Run the search on the training split only, so the test split stays unseen.
grid_cv.fit(X_train,y_train)

In [12]:
grid_cv.best_params_  #the search selected n_neighbors=41 (with p=1 and distance weighting).

{'algorithm': 'auto',
 'leaf_size': 10,
 'n_neighbors': 41,
 'p': 1,
 'weights': 'distance'}

In [13]:
# Refit KNN with the hyperparameters selected by the grid search, then test its accuracy.
# `p` is the exponent of the Minkowski metric: p=1 is the Manhattan (l1) distance,
# p=2 is the Euclidean (l2) distance, and any other p gives the general l_p distance.
knn=KNeighborsClassifier(
    n_neighbors=41,
    weights="distance",
    algorithm="auto",
    leaf_size=10,
    p=1,
)
knn.fit(X_train,y_train)

In [14]:
knn.score(X_test,y_test) #The accuracy increased from 0.54 to 0.645.

0.645

In [15]:
import pickle

# Persist the tuned KNN model to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises, instead of leaving the
# handle open until garbage collection.
with open("knn.pkl", "wb") as model_file:
    pickle.dump(knn, model_file)

In [16]:
#ls

In [17]:
# Predict the quality class of the first test row.
# iloc[[0]] keeps a one-row DataFrame, so the column names seen during fit are
# preserved; wrapping a Series in a list drops them and makes scikit-learn warn
# that X does not have valid feature names.
knn.predict(X_test.iloc[[0]])



array([6], dtype=int64)

In [18]:
# Show the feature values of the first test row (the row scored by knn.predict).
X_test.iloc[0]

fixed acidity            8.0000
volatile acidity         0.7050
citric acid              0.0500
residual sugar           1.9000
chlorides                0.0740
free sulfur dioxide      8.0000
total sulfur dioxide    19.0000
density                  0.9962
pH                       3.3400
sulphates                0.9500
alcohol                 10.5000
Name: 69, dtype: float64

## SVM

In [19]:
# Same wine frame as before; the SVM cells reuse the existing train/test split.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [20]:
from sklearn.svm import SVC

In [21]:
# Baseline support-vector classifier with default hyperparameters.
# NOTE(review): the inputs are unscaled; SVMs are sensitive to feature scale, so
# standardizing the features first is worth trying before tuning.
svc= SVC()
svc.fit(X_train,y_train)

In [22]:
# Test-set accuracy of the default SVC.
svc.score(X_test,y_test)

0.505

In [23]:
# Hyperparameter grid for SVC: 4 kernels * 10 C values * 4 gammas = 160 candidates.
# NOTE(review): per the scikit-learn docs, `gamma` is the kernel coefficient for
# rbf/poly/sigmoid only, so the linear kernel is fitted four times per C with
# identical results — confirm and consider a list of per-kernel grids.
param={
    "kernel":["rbf","poly","sigmoid","linear"], #rbf: radial basis function
    "C":[0.1,0.4,0.6,1,2,3,100,200,500,1000],
    "gamma":[0.001,0.1,0.4,0.004]
    
    
}

In [24]:
# Cross-validated search over the SVC grid; verbose=3 logs progress for each fit
# and n_jobs=-1 uses all cores. The fit itself is commented out in the next cell.
svm_grid= GridSearchCV(svc, param_grid=param,verbose=3, n_jobs=-1)

In [25]:
#svm_grid.fit(X_train,y_train)

In [27]:
#svm_grid.best_params_

In [28]:
#svc= SVC(kernel="linear")
#svc.fit(X_train,y_train)

In [29]:
#svc.score(X_test,y_test)

### SVR 

In [30]:
# Switch to the graduate-admission dataset for the regression (SVR) example.
# NOTE(review): this rebinds `df`, so the wine frame loaded earlier is no longer
# reachable under that name; a distinct name would make re-running cells safer.
df=pd.read_csv("Admission_Prediction.csv")
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,2,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,3,,104.0,3.0,3.0,3.5,8.0,1,0.72
3,4,322.0,110.0,3.0,3.5,2.5,8.67,1,0.8
4,5,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65


In [32]:
# Count the missing values in each column to decide what needs imputing.
df.isnull().sum()

Serial No.            0
GRE Score            15
TOEFL Score          10
University Rating    15
SOP                   0
LOR                   0
CGPA                  0
Research              0
Chance of Admit       0
dtype: int64

In [44]:
# Mean-impute the three columns that the null count reported as having gaps.
# NOTE(review): the means are computed on the full frame before train_test_split,
# so test rows influence the imputed values — consider imputing after the split.
cols_with_gaps=["GRE Score","TOEFL Score","University Rating"]
df[cols_with_gaps]=df[cols_with_gaps].fillna(df[cols_with_gaps].mean())

In [45]:
# Target is the admission probability; the row identifier carries no signal,
# so it is dropped from the features together with the target.
y=df["Chance of Admit"]
X=df.drop(columns=["Serial No.","Chance of Admit"])

In [46]:
# 75/25 train/test split of the admission data with a fixed seed.
# This rebinds X_train/X_test/y_train/y_test, replacing the wine split.
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=45,
)

In [47]:
from sklearn.svm import SVR
# Fit a support-vector regressor with default hyperparameters on the training split.
# NOTE(review): the inputs are unscaled (GRE is ~300 while Research is 0/1); SVMs
# are scale-sensitive, so standardizing the features is worth trying.
svr=SVR()
svr.fit(X_train,y_train)

In [48]:
# Score the regressor on the held-out test split.
svr.score(X_test,y_test)

0.6799358539239625

In [49]:
from sklearn.metrics import r2_score

In [51]:
# Cross-check: computing R^2 explicitly gives the same value as svr.score.
r2_score(y_test,svr.predict(X_test))

0.6799358539239625

In [58]:
# Inspect the feature matrix (pandas truncates the display to the first and last rows).
X

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337.000000,118.0,4.0,4.5,4.5,9.65,1
1,324.000000,107.0,4.0,4.0,4.5,8.87,1
2,316.558763,104.0,3.0,3.0,3.5,8.00,1
3,322.000000,110.0,3.0,3.5,2.5,8.67,1
4,314.000000,103.0,2.0,2.0,3.0,8.21,0
...,...,...,...,...,...,...,...
495,332.000000,108.0,5.0,4.5,4.0,9.02,1
496,337.000000,117.0,5.0,5.0,5.0,9.87,1
497,330.000000,120.0,5.0,4.5,5.0,9.56,1
498,312.000000,103.0,4.0,4.0,5.0,8.43,0


In [59]:
# Show the feature values of the first row in the admission test split.
X_test.iloc[0]

GRE Score            298.00
TOEFL Score          105.00
University Rating      3.00
SOP                    3.50
LOR                    4.00
CGPA                   8.54
Research               0.00
Name: 204, dtype: float64

In [62]:
# Predict the admission chance for the first row of the full feature frame.
# iloc[[0]] keeps a one-row DataFrame, so the column names seen during fit are
# preserved; wrapping a Series in a list drops them and makes scikit-learn warn
# that X does not have valid feature names.
# NOTE(review): row 0 is taken from X, not X_test, so it may be a training row.
svr.predict(X.iloc[[0]])



array([0.87949875])

In [64]:
# True target for the same row that was just predicted. Positional access mirrors
# the X.iloc[0] lookup and does not depend on the index labels starting at 0.
y.iloc[0]

0.92

In [65]:
# Same call as the earlier score cell; the model was not refitted, so the value is unchanged.
svr.score(X_test,y_test)

0.6799358539239625