## Unsupervised Learning

- **CONTEXT**: The data concerns city-cycle fuel consumption in miles per gallon, to be predicted in terms of 3 multi-valued discrete and 5 continuous attributes.

- **DATA DESCRIPTION:**
    * cylinders: multi-valued discrete
    * acceleration: continuous
    * displacement: continuous
    * model year: multi-valued discrete
    * horsepower: continuous
    * origin: multi-valued discrete
    * weight: continuous
    * car name: string (unique for each instance)
    * mpg: continuous
   
- **PROJECT OBJECTIVE:** To understand K-means clustering by applying it to the Car Dataset to segment the cars into various categories.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline




# Data Understanding & Cleaning

In [None]:
df = pd.read_csv("vehicle-1.csv")

In [None]:
df.head()

In [None]:
df.isna().sum()/len(df)

In [None]:
df.dropna(inplace = True)

In [None]:
df.info()

In [None]:
df["class"].value_counts().plot(kind = 'pie',figsize=(11, 6),autopct='%1.2f%%',)
df["class"].value_counts()

In [None]:
df.duplicated().sum()

In [None]:
X = df.drop("class",axis = 1)
y = df['class']

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size  = 0.2,random_state = 42)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
X_train  = scaler.fit_transform(X_train)

In [None]:
X_test = scaler.transform(X_test)

In [None]:
from sklearn.svm import SVC

In [None]:
svc = SVC()

In [None]:
svc.fit(X_train,y_train)

In [None]:
y_pred = svc.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
from sklearn.decomposition import PCA


In [None]:
pca = PCA(n_components=10)

In [None]:
X = scaler.fit_transform(X)

In [None]:
pca_data = pca.fit_transform(X)

In [None]:
pca.explained_variance_

In [None]:
percentage_var_explained=pca.explained_variance_/np.sum(pca.explained_variance_)

cum_var_explained=np.cumsum(percentage_var_explained)

plt.figure(1,figsize=(12,8))
plt.clf()
plt.plot(cum_var_explained,linewidth=2,marker='o')
plt.plot((0,9),(0.9,0.9),'--',marker='o')
plt.grid()
plt.xlabel('n_components')
plt.ylabel('Cumulative_explained_Variance')
plt.show()

In [None]:
pca = PCA(n_components=4)
pca_data = pca.fit_transform(X)

In [None]:
pca_data

In [None]:
pca.n_components_

In [None]:
pca.explained_variance_ratio_

In [None]:
np.sum(pca.explained_variance_ratio_)

In [None]:
svc = SVC()

In [None]:
svc.fit(pca_data,y)

In [None]:
y_pred = svc.predict(pca_data)

In [None]:
print(classification_report(y,y_pred))

# Performance Improvement

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X, y)

In [None]:
print(clf.get_params)
print('-'*40)
clf.best_params_

In [None]:
pred_grid = clf.predict(X)

In [None]:
print(classification_report(y,pred_grid))