In [240]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Question 1

In [241]:
weather_df = pd.read_csv('/content/weather.csv')
weather_df.head()

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play
0,rainy,hot,high,0,0
1,rainy,hot,high,1,0
2,overcast,hot,high,0,1
3,sunny,mild,high,0,1
4,sunny,cool,normal,0,1


In [242]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
weather_df['Outlook'] = le.fit_transform(weather_df['Outlook'])
weather_df['Outlook'].unique()

weather_df['Temp'] = le.fit_transform(weather_df['Temp'])
weather_df['Temp'].unique()

weather_df['Humidity'] = le.fit_transform(weather_df['Humidity'])
weather_df['Humidity'].unique()

array([0, 1])

In [243]:
X = weather_df.iloc[:,0:4]
Y = weather_df.iloc[:,4]

In [244]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [245]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [246]:
dt_pipe = Pipeline([('dt', DecisionTreeClassifier())])
params = [{
    'dt__criterion':['gini', 'entropy'],
    'dt__max_depth':[3, 5, 7, 9, 11],
    'dt__random_state':[42]
}]

dt_pipe.get_params().keys()
gs_dt = GridSearchCV(dt_pipe,
                    param_grid=params,
                    scoring='accuracy',
                    cv=5)
gs_dt.fit(x_train, y_train)
print(gs_dt.best_params_)
print(gs_dt.best_score_)



{'dt__criterion': 'gini', 'dt__max_depth': 3, 'dt__random_state': 42}
0.7


In [247]:
from sklearn.metrics import classification_report
y_pred = gs_dt.predict(X)
print(classification_report(Y, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.60      0.60         5
           1       0.78      0.78      0.78         9

    accuracy                           0.71        14
   macro avg       0.69      0.69      0.69        14
weighted avg       0.71      0.71      0.71        14



# Question 2

In [248]:
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
boston_df = pd.read_csv('/content/housing.csv',  header=None, delimiter=r"\s+", names=column_names)
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [None]:
plt.figure(figsize=(20, 10))
sns.heatmap(boston_df.corr(), annot=True)

In [250]:
boston_df = boston_df.drop("RAD", axis = 1)
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,222.0,18.7,396.9,5.33,36.2


In [251]:
X = np.array(boston_df.iloc[:,0:12])
y = np.array(boston_df.iloc[:,12])

In [252]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [253]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [254]:
from sklearn.svm import SVR
svr = SVR()
svr.fit(X_train, y_train)

SVR()

In [255]:
y_pred = svr.predict(X_test)

In [256]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.5839563150792234

In [None]:
from sklearn.model_selection import GridSearchCV
  
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf', 'linear', 'poly']} 
  
grid = GridSearchCV(SVR(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 5 folds for each of 75 candidates, totalling 375 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.145 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.144 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.191 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.290 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.218 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.211 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.212 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.291 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.437 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.328 total time=   0.0s
[CV 1/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.596 total time=   0.0s
[CV 2/5] END .......C=0.1, gamma=1, kernel=poly

In [None]:
print(grid.best_params_)
print(grid.best_estimator_)

In [None]:
y_gcv_pred = grid.predict(X_test)

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, y_gcv_pred)

# Question 3

In [None]:
cust_df = pd.read_csv('/content/Mall_Customers.csv')
print(cust_df.shape)
print(cust_df.head())

In [None]:
cust_df.isna().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cust_df['Gender'] = le.fit_transform(cust_df['Gender'])
cust_df.head()

In [None]:
cust_df.drop('CustomerID', axis=1, inplace=True)
cust_df.head()

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
y_kmeans=kmeans.fit_predict(cust_df)

In [None]:
y_kmeans

In [None]:
cust_df_np = np.array(cust_df)

In [None]:
centers = kmeans.cluster_centers_

plt.scatter(cust_df_np[:, 2], cust_df_np[:, 3], c=y_kmeans, s=40, cmap='viridis')
plt.scatter(centers[:, 2], centers[:, 3], c='black', s=200, alpha=0.5);

# Question 4

In [None]:
cust_df = pd.read_csv('/content/Mall_Customers.csv')
print(cust_df.shape)
print(cust_df.head())

In [None]:
cust_df.isna().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cust_df['Gender'] = le.fit_transform(cust_df['Gender'])
cust_df.head()

In [None]:
cust_df.drop('CustomerID', axis=1, inplace=True)
cust_df.head()

In [None]:
from sklearn.cluster import AgglomerativeClustering
agg = AgglomerativeClustering(n_clusters=5)
y_agg = agg.fit_predict(cust_df)

In [None]:
plt.scatter(cust_df_np[:, 2], cust_df_np[:, 3], c=y_agg, s=40, cmap='viridis')

# Question 5

In [None]:
import tensorflow as tf

In [None]:
diabetes_df = pd.read_csv('/content/diabetes.csv')
diabetes_df.head()

In [None]:
diabetes_df.shape

In [None]:
X = np.array(diabetes_df.loc[:,'Pregnancies':'Age'])
Y = np.array(diabetes_df.loc[:,'Outcome'])
print(type(X))
print(type(Y))

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
ann = tf.keras.models.Sequential()
# Add the input layer and first hidden layer
ann.add(tf.keras.layers.Dense(units=12, activation='relu', input_shape=X_train[0].shape))
# Add the second hidden layer
ann.add(tf.keras.layers.Dense(units=8, activation='relu'))
# Add the output layer
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))


In [None]:
from tensorflow.keras.utils import plot_model
plot_model(ann,
           to_file="model.png",
           show_shapes=True,
           show_layer_names=True,
          )

In [None]:
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
ann.fit(X_train, y_train, batch_size = 32, epochs = 200, verbose=0)

In [None]:
y_pred = ann.predict(X_test)
y_pred = (y_pred > 0.5)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

# Question 6

In [None]:
cancer_df = pd.read_csv('/content/breast_cancer.csv')
cancer_df.head()

In [None]:
cancer_df.shape

In [None]:
le = LabelEncoder()
cancer_df['diagnosis'] = le.fit_transform(cancer_df['diagnosis'])

In [None]:
X = np.array(cancer_df.iloc[:,1:31])
Y = np.array(cancer_df.iloc[:,0])

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
ann = tf.keras.models.Sequential()
# Add the input layer and first hidden layer
ann.add(tf.keras.layers.Dense(units=12, activation='relu', input_shape=X_train[0].shape))
# Add the second hidden layer
ann.add(tf.keras.layers.Dense(units=8, activation='relu'))
# Add the output layer
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))


In [None]:
from tensorflow.keras.utils import plot_model
plot_model(ann,
           to_file="model.png",
           show_shapes=True,
           show_layer_names=True,
          )

In [None]:
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
ann.fit(X_train, y_train, batch_size = 50, epochs = 200, verbose=0)

In [None]:
y_pred = ann.predict(X_test)
y_pred = (y_pred > 0.5)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))