# CS407 Machine Learning Final

## By: Jack Canducci

### Imports

In [1]:
# General Imports
import pandas as pd
import numpy as np
#import MLLibrary as ml

# Hypothesis Testing
from scipy.stats import norm

# Fuzzy Matching
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from textdistance import levenshtein
from textdistance import jaro_winkler
from textdistance import jaccard

# Linear Programming
from pulp import LpVariable
from pulp import LpSolver
from pulp import LpProblem
from pulp import LpStatus
from pulp import LpInteger
from pulp import LpMaximize
from pulp import LpMinimize

# Data Setup
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Model Training / Testing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics

# Regressors
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

# Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

ModuleNotFoundError: No module named 'MLLibrary'

## Regression

#### Setup data

In [None]:
#https://inria.github.io/scikit-learn-mooc/python_scripts/datasets_california_housing.html
californiaHousing = fetch_california_housing()

data = californiaHousing.data
target = californiaHousing.target

dataTrain, dataTest, targetTrain, targetTest = train_test_split(data, target, test_size=0.25)

# Needed for NN
standardScaler = StandardScaler()
scaledDataTrain = standardScaler.fit_transform(dataTrain)
scaledDataTest = standardScaler.transform(dataTest)

#### Linear Regression

In [None]:
linearRegression_model = LinearRegression()

linearRegression_model.fit(dataTrain, targetTrain)

targetPrediction = linearRegression_model.predict(dataTest)

print('Mean Absolute Error:', metrics.mean_absolute_error(targetTest, targetPrediction))  
print('Mean Squared Error:', metrics.mean_squared_error(targetTest, targetPrediction))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(targetTest, targetPrediction))) 

Mean Absolute Error: 0.5378425015997423
Mean Squared Error: 0.5561263424248348
Root Mean Squared Error: 0.7457387896742631


#### Decision Tree

In [None]:
decisionTreeRegressor_model = DecisionTreeRegressor()

decisionTreeRegressor_model.fit(dataTrain, targetTrain)

targetPrediction = decisionTreeRegressor_model.predict(dataTest)

print('Mean Absolute Error:', metrics.mean_absolute_error(targetTest, targetPrediction))  
print('Mean Squared Error:', metrics.mean_squared_error(targetTest, targetPrediction))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(targetTest, targetPrediction))) 

Mean Absolute Error: 0.4558868352713179
Mean Squared Error: 0.5116152133218411
Root Mean Squared Error: 0.7152728243976847


#### Random Forest

In [None]:
randomForestRegressor_model = RandomForestRegressor(n_estimators=100, random_state=0)  

randomForestRegressor_model.fit(dataTrain, targetTrain)  

targetPrediction = randomForestRegressor_model.predict(dataTest)  

print('Mean Absolute Error:', metrics.mean_absolute_error(targetTest, targetPrediction))  
print('Mean Squared Error:', metrics.mean_squared_error(targetTest, targetPrediction))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(targetTest, targetPrediction))) 

Mean Absolute Error: 0.32568516170542655
Mean Squared Error: 0.2525042512946152
Root Mean Squared Error: 0.5024980112344876


#### MLP NN

In [None]:
multiLayerPerceptronRegressor_model = MLPRegressor(hidden_layer_sizes=(64,64,64),activation="relu" ,random_state=1, max_iter=2000)

multiLayerPerceptronRegressor_model.fit(dataTrain, targetTrain)

targetPrediction = multiLayerPerceptronRegressor_model.predict(dataTest)

print('Mean Absolute Error:', metrics.mean_absolute_error(targetTest, targetPrediction))  
print('Mean Squared Error:', metrics.mean_squared_error(targetTest, targetPrediction))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(targetTest, targetPrediction))) 

Mean Absolute Error: 1.3371878702003033
Mean Squared Error: 2.9457267133563194
Root Mean Squared Error: 1.7163119510614377


While the lowest set of errors were coming from my random forest, it seems as though the highest is coming from my neural network. I assume that's due to an underlying factor of how these systems for regression are applicable. My general linear regression model and my decision tree were very in between these two. This shows that my other two models were more middle grounds between the NN and the RF.

## Classification

### Setup Data

In [None]:
pima = pd.read_csv("CSV/pima-indians-diabetes.csv")

data_mod = pima[(pima.BloodP != 0) & (pima.BMI != 0) & (pima.Glucose != 0)]

data = ['Pregnancies', 'Glucose', 'BloodP', 'SkinThick', 'BMI', 'Age', 'Insulin', 'DiabetesPedigreeFunction']
target = "Outcome"

#dataTrain, dataTest, targetTrain, targetTest = train_test_split(data_mod, test_size=0.25)
train, test = train_test_split(data_mod, test_size=0.25)

standardScaler = StandardScaler()
dataTrainScaled=standardScaler.fit_transform(train[data])
dataTestScaled=standardScaler.transform(test[data])


### Naive Bayes

In [None]:
naiveBayes_model = GaussianNB()

naiveBayes_model.fit(train[data], train[target])

targetPrediction = naiveBayes_model.predict(test[data])

print('Accuracy Score: ', round(accuracy_score(test[target], targetPrediction)*100, 2), '%')


Accuracy Score:  71.27 %


### Logistic

In [None]:
logisticRegression_model = LogisticRegression()

logisticRegression_model.fit(train[data], train[target])

targetPrediction = logisticRegression_model.predict(test[data])

print('Accuracy Score: ', round(accuracy_score(test[target], targetPrediction)*100, 2),'%')

Accuracy Score:  72.38 %


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree

In [None]:
decisionTreeClassifier_model = DecisionTreeClassifier()

decisionTreeClassifier_model.fit(train[data], train[target])

targetPrediction = decisionTreeClassifier_model.predict(test[data])

print('Accuracy Score: ', round(accuracy_score(test[target], targetPrediction)*100, 2),'%')

Accuracy Score:  69.61 %


### Random Forest

In [None]:
randomForestClassifier_model = RandomForestClassifier()

randomForestClassifier_model.fit(train[data], train[target])

targetPrediction = randomForestClassifier_model.predict(test[data])

print('Accuracy Score: ', round(accuracy_score(test[target], targetPrediction)*100, 2),'%')

Accuracy Score:  71.27 %


### MLP NN

In [None]:
multiLayerPerceptronClassifier_model = MLPClassifier(hidden_layer_sizes=(256,128,64,32),activation="relu",random_state=1)

multiLayerPerceptronClassifier_model.fit(dataTrainScaled, train[target])

targetPrediction = multiLayerPerceptronClassifier_model.predict(dataTestScaled)

print('Accuracy Score: ', round(accuracy_score(test[target], targetPrediction) * 100, 2), '%')

Accuracy Score:  67.4 %


Out of all my models, my logistic model actually did the best which is surprising to me. I would have assumed that the NN would do best but it seems it nearly did the worst. Naive Bayes did a little worse with an accuracy score of 74.59% followed by the decision tree, MLP NN, and then the random forest. I'm a little confused about why the random forest did poorly compared to the others so I'll explore more into this later.

## Fuzzy Matching

### Setup Data

In [None]:
stringOne = "Machine Learning is fun"
stringTwo = "Professor Utpal has made me enjoy this subject. That is a feat."

### Simple Ratio

In [None]:
fuzz.ratio(stringOne, stringTwo)

26

### Partial Ratio

In [None]:
fuzz.partial_ratio(stringOne, stringTwo)

35

### Token Sort Ratio

In [None]:
fuzz.token_sort_ratio(stringOne, stringTwo)

26

### Token Set Ratio

In [None]:
fuzz.token_set_ratio(stringOne, stringTwo)

29

### Levenshtein Distance

In [None]:
levenshtein.distance(stringOne, stringTwo)

51

### Jaro-Winker

In [None]:
jaro_winkler.distance(stringOne, stringTwo)

0.5087416609155739

### Jaccard

In [None]:
jaccard.distance(stringOne, stringTwo)

0.7714285714285715

## Linear Programming

### Problem 1

In [None]:
problem = LpProblem("Stratton", LpMaximize)

extrusionType1 = LpVariable("ExtrusionOne",lowBound=0,cat='Continuous')
packagingType1 = LpVariable("PackagingOne",lowBound=0, cat='Continuous')
additionalType1 = LpVariable("AdditionalOne", lowBound=0, cat='Continuous')

extrusionType2 = LpVariable("ExtrusionTwo",lowBound=0,cat='Continuous')
packagingType2 = LpVariable("PackagingTwo",lowBound=0, cat='Continuous')
additionalType2 = LpVariable("AdditionalTwo", lowBound=0, cat='Continuous')

problem += (34 * (extrusionType1 + packagingType1 + additionalType1)) + (40 * (extrusionType2 + packagingType2 + additionalType2))

problem += (extrusionType1 * 4) + (extrusionType2 * 6) <= 48
problem += (packagingType1 * 2) + (packagingType2 * 2) <= 18
problem += (additionalType1 * 2) + (additionalType2 * 1) <= 16 

problem.solve()
LpStatus[problem.status]

for variable in problem.variables():     # Print the Variable values for Optimized Objective
    print(variable.name, '=', variable.varValue)

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /home/septri/.local/lib/python3.9/site-packages/pulp/apis/../solverdir/cbc/linux/64/cbc /tmp/d85e455232d440e19fa537a02ba69f46-pulp.mps max timeMode elapsed branch printingOptions all solution /tmp/d85e455232d440e19fa537a02ba69f46-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 8 COLUMNS
At line 21 RHS
At line 25 BOUNDS
At line 26 ENDATA
Problem MODEL has 3 rows, 6 columns and 6 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Presolve 0 (-3) rows, 0 (-6) columns and 0 (-6) elements
Empty problem - 0 rows, 0 columns and 0 elements
Optimal - objective value 1408
After Postsolve, objective 1408, infeasibilities - dual 0 (0), primal 0 (0)
Optimal objective 1408 - 0 iterations time 0.002, Presolve 0.00
Option for printingOptions changed from normal to all
Total time (CPU seconds):       0.00   (Wallclock seconds):       0.00


### Problem 2

In [None]:
problem2 = LpProblem("Problem 2", LpMaximize)

pants = LpVariable("Pants", lowBound=0, cat='Continuous')
jackets = LpVariable("Jackets", lowBound=0, cat='Continuous')

problem2 += (50 * (pants)) + (40 * (jackets))

problem2 += (pants * 1) + (jackets * 1.5) <= 750
problem2 += (pants * 2) + (jackets * 1) <= 1000

problem2.solve()
LpStatus[problem2.status]

for variable in problem2.variables():     # Print the Variable values for Optimized Objective
    print(variable.name, '=', variable.varValue)

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /home/septri/.local/lib/python3.9/site-packages/pulp/apis/../solverdir/cbc/linux/64/cbc /tmp/50e7d3f78fbb4b618cb651987824fd47-pulp.mps max timeMode elapsed branch printingOptions all solution /tmp/50e7d3f78fbb4b618cb651987824fd47-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 7 COLUMNS
At line 14 RHS
At line 17 BOUNDS
At line 18 ENDATA
Problem MODEL has 2 rows, 2 columns and 4 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Presolve 2 (0) rows, 2 (0) columns and 4 (0) elements
0  Obj -0 Dual inf 90 (2)
0  Obj -0 Dual inf 90 (2)
2  Obj 28750
Optimal - objective value 28750
Optimal objective 28750 - 2 iterations time 0.002
Option for printingOptions changed from normal to all
Total time (CPU seconds):       0.00   (Wallclock seconds):       0.00

Jackets = 250.0
Pants = 375.0




## Hypothesis Testing

### Problem 1

In [None]:
x_bar = 1.39
µ = 1.48
s_dev = 0.84
z_score = (x_bar - µ)/s_dev
print("Z-score = ", z_score)
p_value = norm.cdf(z_score) # since it is a Two Tail test
print("p-value = ",p_value)

Z-score =  -0.10714285714285725
p-value =  0.4573378238740764


### Problem 2

In [None]:
x_bar = 160.1
µ = 162.9
s_dev = 1.6
z_score = (x_bar - µ)/s_dev
print("Z-score = ", z_score)
p_value = norm.cdf(z_score) # since it is a Left Tail test
print("p-value = ",p_value)

Z-score =  -1.750000000000007
p-value =  0.040059156863816475
