# CSCI 390: Machine Learning

File name: Turauth_HW3.ipynb

The purpose of this file is to explore the application of linear regression and Support Vector Machines.

## Part 1

In [1]:
# Mount Google Drive to this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Import libraries.
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import MinMaxScaler # (https://towardsai.net/p/data-science/how-when-and-why-should-you-normalize-standardize-rescale-your-data-3f083def38ff)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
import sklearn.model_selection as model
import sklearn.neighbors as nbrs
import sklearn.feature_selection as featsel
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # https://medium.com/@basumatary18/implementing-linear-regression-on-california-housing-dataset-378e14e421b7

# Import the data set (https://medium.com/@basumatary18/implementing-linear-regression-on-california-housing-dataset-378e14e421b7) and assign the features and target
# to X and y.
housing = fetch_california_housing(as_frame=True)
X = housing.data
y = housing.target.to_numpy().reshape(-1,1)

# There is no need to check for data errors or anomalies or to examine and replace missing values because the dataset is well-known. There are no computed fields
# being considered in this exercise.

# Normalize the data.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
y = scaler.fit_transform(y)

#Partition the data into training and test data.
X_train, X_test, y_train, y_test = model.train_test_split(X, y, test_size=.3, random_state=0)

# Make a linear regressor, provide the final accuracy, and print out the coefficient.
reg = LinearRegression().fit(X, y)
print(round(reg.score(X, y),3))
print(housing.feature_names)
reg.coef_

0.606
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


array([[ 1.30559051,  0.09922117, -3.12146391,  4.48662399, -0.02925217,
        -0.97016368, -0.81743339, -0.89948456]])

The coefficients indicate how great the effect of each feature is on the target, the housing price.
The greater the coefficient, the greater is the effect of the corresponding on the target.
Features AveRooms and AveBedrms have a greater effect on the target than the other features.
Features AveRooms, Population, AveOccup, Latitude, and Longitude are inversely proportional to
the target.

## Part 2

In [4]:
# Import libraries.
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
import sklearn.model_selection as model
from sklearn.svm import SVC
from scipy import stats
import pandas as pd
import numpy as np

# Show the file where to find the weather data, read it, and drop rows where the target value is missing.
pathName = "/content/drive/MyDrive/Colab Notebooks/HW3/"
df1 = pd.read_excel(pathName + 'weatherAUS100.xlsx', sheet_name='Data')
df2 = pd.read_excel(pathName + 'weatherAUS100.xlsx', sheet_name='Bounds')
df1.dropna(subset='RainToday',inplace=True)
X = df1.drop(['RainToday'], axis=1)
y = df1.RainToday

# Remove data errors and anomalies from the feature data.
X2 = df2.drop(['Stats'], axis=1)
X2.to_numpy()
for feature, item in X.items(): # https://note.nkmk.me/en/python-pandas-dataframe-for-iteration/
  # Access bounds using .loc with column name and index label.
  lower_bound = X2.loc[2, feature]
  upper_bound = X2.loc[3, feature]
  # Replace values outside bounds with NaN.
  X.loc[X[feature] < lower_bound, feature] = np.nan
  X.loc[X[feature] > upper_bound, feature] = np.nan

# Impute missing feature values.
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Normalize the data and partition it into training and testing data.
y.to_numpy().reshape(-1,1)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = model.train_test_split(X, y, test_size=.3, random_state=0)

# Train and measure the performance of the classifier.
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
  clf = SVC(kernel=kernel).fit(X_train, y_train)
  print('The ' + kernel + ' SVM score is %f' % clf.score(X_train, y_train))

The linear SVM score is 0.897059
The poly SVM score is 0.970588
The rbf SVM score is 0.911765
The sigmoid SVM score is 0.897059


The above version uses only the first hundred rows of the dataset. When using the full dataset, the rbf kernel was found to be the most accurate.

In [5]:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

# Show the file where to find the weather data, read it, and drop rows where the target value is
pathName = "/content/drive/MyDrive/Colab Notebooks/HW3/"
df1 = pd.read_excel(pathName + 'weatherAUS1200.xlsx', sheet_name='Data')
df2 = pd.read_excel(pathName + 'weatherAUS1200.xlsx', sheet_name='Bounds')
df1.dropna(subset='RainToday',inplace=True)
X = df1.drop(['RainToday'], axis=1)
y = df1.RainToday

# Remove data errors and anomalies from the feature data.
X2 = df2.drop(['Stats'], axis=1)
X2.to_numpy()
for feature, item in X.items(): # https://note.nkmk.me/en/python-pandas-dataframe-for-iteration/
  # Access bounds using .loc with column name and index label.
  lower_bound = X2.loc[2, feature]
  upper_bound = X2.loc[3, feature]
  # Replace values outside bounds with NaN.
  X.loc[X[feature] < lower_bound, feature] = np.nan
  X.loc[X[feature] > upper_bound, feature] = np.nan

# Impute missing feature values.
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Normalize the data.
y.to_numpy().reshape(-1,1)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Use GridSearchCV to find the values of C and gamma that give the highest accuracy.
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X, y)

print(
"The best parameters are %s with a score of %0.2f."
% (grid.best_params_, grid.best_score_)
)

KeyboardInterrupt: 

My latest version was only saved as a PDF.

The output of the above code was this: "
The best parameters are {'C': 10000.0, 'gamma': 0.1} with a score of 0.90."

The above accuracy score is for the subsample including the first 1200 rows of the data.

In [None]:
# Import libraries.
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
import sklearn.model_selection as model
from sklearn.svm import SVC
from scipy import stats
import pandas as pd
import numpy as np

# Show the file where to find the weather data, read it, and drop rows where the target value is
pathName = "/content/drive/MyDrive/Colab Notebooks/HW3/"
df1 = pd.read_excel(pathName + 'weatherAUS.xlsx', sheet_name='Data')
df2 = pd.read_excel(pathName + 'weatherAUS.xlsx', sheet_name='Bounds')
df1.dropna(subset='RainToday',inplace=True)
X = df1.drop(['RainToday'], axis=1)
y = df1.RainToday

# Remove data errors and anomalies from the feature data.
X2 = df2.drop(['Stats'], axis=1)
X2.to_numpy()
for feature, item in X.items(): # https://note.nkmk.me/en/python-pandas-dataframe-for-iteration/
  # Access bounds using .loc with column name and index label.
  lower_bound = X2.loc[2, feature]
  upper_bound = X2.loc[3, feature]

# Replace values outside bounds with NaN.
X.loc[X[feature] < lower_bound, feature] = np.nan
X.loc[X[feature] > upper_bound, feature] = np.nan

# Impute missing feature values.
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Normalize the data and partition it into training and testing data.
y.to_numpy().reshape(-1,1)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = model.train_test_split(X, y, test_size=.3, random_state=0)
clf = SVC(C=10000, kernel='rbf', gamma=0.1).fit(X_train, y_train)

# Train and measure the performance of the classifier using the best parameters for the rbf kerne
print('The best rbf setting SVM score for the entire model is %f' % clf.score(X_train, y_train))

A few lines of this last section were cut off in the PDF, but I filled them in and am pretty sure the code shown gave the following output: "The best rbf setting SVM score for the entire model is 0.836858".