# Importing Libraries and Datasets

### Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score, accuracy_score
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.impute import KNNImputer

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Importing the datasets

In [2]:
# assigning column names from dataset details
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataset = pd.read_csv(url, names=names)
X = dataset.iloc[:, :-1].values #all rows, EXCEPT last column for independent variable
y = dataset.iloc[:, -1].values #all rows, and just last column for depent variable vector 

print(dataset.head())

   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1


### Encoding categorical data

In [3]:
# all data is nominal

# Requirement 1

## Splitting the dataset into the Training, Test, and Validation sets 

In [4]:
#60/20/20 training split. train/validation/test
from sklearn.model_selection import train_test_split

#splitting twice for 60/20/20 split
X_leftover, X_test, y_leftover, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) #800 leftover 200 test
X_train, X_valid, y_train, y_valid = train_test_split(X_leftover, y_leftover, test_size = 0.25, random_state = 0) #25% valid, 75% test, 200, 600

In [5]:
#checking lengths just in case
print(len(X_train))
print(len(y_train))
print(len(X_valid))
print(len(y_valid))
print(len(X_test))
print(len(y_test))


460
460
154
154
154
154


## Taking Care of Misssing Data (R1.2)

### Checking for Missing Data

In [6]:
# checkl if there is any missing data
columns_with_zeros_as_missing = ['plas', 'pres', 'skin', 'test', 'mass']

missing_values = dataset[columns_with_zeros_as_missing].apply(lambda x: (x == 0).sum())
print(missing_values)

num_rows = dataset.shape[0]
print(f"The dataset has {num_rows} rows.")

plas      5
pres     35
skin    227
test    374
mass     11
dtype: int64
The dataset has 768 rows.


In [7]:
print(dataset.head())

   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1


### Taking Care of Missing Data (a lot)

#### since there is so much missing data, i will use k nearest neighbors to fill in the missing data instead of just the mean

In [8]:
# turning object data types into numeric bc of errors 
for column in ['plas', 'pres', 'skin', 'test', 'mass']:
    dataset[column] = pd.to_numeric(dataset[column], errors='coerce')

In [9]:
# large amounts of missing data awas found. We will use k nneighbors to best fill up the dataset
columns_with_zeros_as_missing = ['plas', 'pres', 'skin', 'test', 'mass']
dataset[columns_with_zeros_as_missing] = dataset[columns_with_zeros_as_missing].replace(0, np.nan)

print(dataset.dtypes)

imputer = KNNImputer(n_neighbors=5, weights='uniform')
imputed_data = imputer.fit_transform(dataset)
dataset_imputed = pd.DataFrame(imputed_data, columns=dataset.columns)

# Display the head of the imputed DataFrame to verify changes
print(dataset_imputed.head())

preg       int64
plas     float64
pres     float64
skin     float64
test     float64
mass     float64
pedi     float64
age        int64
class      int64
dtype: object
   preg   plas  pres  skin   test  mass   pedi   age  class
0   6.0  148.0  72.0  35.0  169.0  33.6  0.627  50.0    1.0
1   1.0   85.0  66.0  29.0   58.6  26.6  0.351  31.0    0.0
2   8.0  183.0  64.0  25.8  164.6  23.3  0.672  32.0    1.0
3   1.0   89.0  66.0  23.0   94.0  28.1  0.167  21.0    0.0
4   0.0  137.0  40.0  35.0  168.0  43.1  2.288  33.0    1.0


# Requirement 2



### Generating Perceptron Model and Deploying on Test Data

In [10]:
# create perceptron
perceptron = Perceptron(max_iter=1000, tol=1e-3, random_state=0)

perceptron.fit(X_train, y_train)
y_pred_valid = perceptron.predict(X_valid)

#printing out initial validation scores
f1_scores_per_class = f1_score(y_valid, y_pred_valid, average=None)
f1_score_weighted = f1_score(y_valid, y_pred_valid, average='weighted')
print("f1 score:", f1_score_weighted)
print("f1 scores for each class:", f1_scores_per_class)


f1 score: 0.6079399453244859
f1 scores for each class: [0.67379679 0.49586777]


# Requirement 3

## Applying SMOTE to Balance the Dataset

In [11]:
smote = SMOTE(random_state=42)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print('original dataset: ', Counter(y_train))
print('resampled dataset: ', Counter(y_train_resampled))

original dataset:  Counter({0: 296, 1: 164})
resampled dataset:  Counter({0: 296, 1: 296})


## Generating NEW Linear Perceptron on the Oversampled Dataset

### Making New Perceptron Model and Displaying Results

In [12]:
# generate new linear perceptron model on oversampled data set
perceptron.fit(X_train_resampled, y_train_resampled)
y_pred_test = perceptron.predict(X_test)

#printing scores for each class post smotenc
f1oversampled = f1_score(y_test, y_pred_test, average=None)
f1_scores_per_class2 = f1_score(y_test, y_pred_test, average=None)
print(f"post smotenc f1 Score:")
print("post smotenc f1 Scores for Each Class:", f1_scores_per_class)

post smotenc f1 Score:
post smotenc f1 Scores for Each Class: [0.67379679 0.49586777]


# Requirement 5

### Idenfitying the Least Significant Predictors

In [13]:
#using pandas lib to find correlations
correlation_matrix = dataset_imputed.corr()
target_correlation = correlation_matrix['class'].sort_values()
print(target_correlation)


pedi     0.173844
pres     0.176665
preg     0.221898
age      0.238356
skin     0.279530
mass     0.313882
test     0.320151
plas     0.495853
class    1.000000
Name: class, dtype: float64


## Dropping Two Least Significant Features

### Dropping the features and displaying F values

Check the cell below this shit has to be fuckin wrong

In [14]:
print(dataset_imputed.head())

   preg   plas  pres  skin   test  mass   pedi   age  class
0   6.0  148.0  72.0  35.0  169.0  33.6  0.627  50.0    1.0
1   1.0   85.0  66.0  29.0   58.6  26.6  0.351  31.0    0.0
2   8.0  183.0  64.0  25.8  164.6  23.3  0.672  32.0    1.0
3   1.0   89.0  66.0  23.0   94.0  28.1  0.167  21.0    0.0
4   0.0  137.0  40.0  35.0  168.0  43.1  2.288  33.0    1.0


In [15]:
print(Counter(y_train))
print(Counter(y_train_resampled))

Counter({0: 296, 1: 164})
Counter({0: 296, 1: 296})


In [16]:
#drop least 2 sig var: pedi pres
dataset_reduced = dataset_imputed.drop(['pedi', 'pres'], axis=1)

# redoing some preprocessing bc you dropped columns
X = dataset_reduced.iloc[:, :-1].values
y = dataset_reduced.iloc[:, -1].values   

#splitting twice for 60/20/20 split
X_leftover, X_test, y_leftover, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) #800 leftover 200 test
X_train, X_valid, y_train, y_valid = train_test_split(X_leftover, y_leftover, test_size = 0.25, random_state = 0) #25% valid, 75% test, 200, 600

# #redoing smote after the drop so the baby variables dont have it either. it should be the same otherwise tho
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# feature scaling
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# new perceptron again
perceptron = Perceptron(max_iter=1000, tol=1e-3, random_state=0)
perceptron.fit(X_train_scaled, y_train_resampled)

# f1 scores
y_pred_valid = perceptron.predict(X_valid_scaled)
f1_valid = f1_score(y_valid, y_pred_valid, average='weighted')  # Using 'weighted' to handle class imbalance
print(f"F1 Score: {f1_valid:.4f}")
f1_valid = f1_score(y_valid, y_pred_valid, average=None)
print(f"F1 score for each class: {f1_valid}")

F1 Score: 0.7314
F1 score for each class: [0.83555556 0.55421687]


accuracy of class of interest was increased

# Requirement 6

### Identify the Two Most Significant Features

In [17]:
correlation_matrix = dataset.corr()
target_correlation = correlation_matrix['class'].sort_values()
print(target_correlation)

pres     0.170589
pedi     0.173844
preg     0.221898
age      0.238356
skin     0.259491
test     0.303454
mass     0.313680
plas     0.494650
class    1.000000
Name: class, dtype: float64


# Requirement 7

## Tunning the Oversampling Ratio and n-values

### Using Grid Search on the Validation Set to find optimal tuning

In [18]:
print(Counter(y_train))

Counter({0.0: 296, 1.0: 164})


In [19]:
# setting up the pipline
pipeline = ImPipeline([
    ('smote', SMOTE(random_state=42)),
    ('perceptron', Perceptron(max_iter=1000, tol=1e-3, random_state=0))
])

# setting neighbors for the search to try
param_grid = {
    'smote__sampling_strategy': [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.0],  # Adjust based on your dataset's imbalance
    'smote__k_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # Fewer values for initial testing
}

# actually doing the grid search
grid_search = GridSearchCV(pipeline, param_grid, scoring='f1_weighted', cv=5)
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)

# using best estimator to predict on validation set
best_estimator = grid_search.best_estimator_
y_pred_valid = best_estimator.predict(X_valid)

# printing f1 scores
f1_valid = f1_score(y_valid, y_pred_valid, average='weighted')
print(f"F1 Score: {f1_valid:.4f}")
f1_scores_per_class_valid = f1_score(y_valid, y_pred_valid, average=None)
print("F1 Scores for Each Class :", f1_scores_per_class_valid)

Best parameters found:  {'smote__k_neighbors': 6, 'smote__sampling_strategy': 1.0}
F1 Score: 0.5252
F1 Scores for Each Class : [0.48529412 0.59302326]


250 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
250 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\noetr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\noetr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\noetr\AppData

: 