In [None]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import random
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn import svm

Here are links to the two datasets I used: https://www.kaggle.com/datasets/jessemostipak/hotel-booking-demand, https://www.kaggle.com/datasets/pkdarabi/diabetes-dataset-with-18-features

## Load The Data

In [None]:
# Read the diabetes dataset; the CSV is expected in the notebook's working directory.
diabetes_csv = "diabetes.csv"
df = pd.read_csv(diabetes_csv)

In [None]:
# Preview the freshly loaded diabetes frame using the notebook's rich display.
df

## Standardize Features

In [None]:
# Z-score every feature column in place: subtract the column mean and divide
# by the column's sample standard deviation. The 'Diabetes' target is skipped
# so its class labels stay intact.
for column in df.columns:
    if column == 'Diabetes':
        continue
    centered = df[column] - df[column].mean()
    df[column] = centered / df[column].std()

In [None]:
# Separate the target vector from the feature matrix.
y = df['Diabetes']
X = df.drop(columns=['Diabetes'])

## General Cross_Validate Function To Use

In [None]:
def cross_validate(X, y, num_iter=10, test_size=0.2, random_state=None):
    """Score a logistic regression over repeated random hold-out splits.

    Each iteration draws a fresh random train/test split, fits a default
    ``LogisticRegression`` on the training part and scores its predictions
    on the held-out part for the positive class (label ``1``).

    Note that this is repeated random sub-sampling, not k-fold: the test
    sets of different iterations may overlap.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` / ``LogisticRegression``.
    y : binary target aligned with ``X``; the positive class must be ``1``.
    num_iter : int, default 10
        Number of independent random splits to evaluate.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Seed for the sequence of splits. ``None`` keeps the original
        unseeded behaviour (results differ between runs); an integer makes
        the whole sequence of splits repeatable.

    Returns
    -------
    dict
        ``{'precision': [...], 'recall': [...], 'f1': [...]}`` with one
        score per iteration.
    """
    # One generator shared by all iterations: every round still gets a
    # different split, but the full sequence is repeatable for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    results = {'precision': [], 'recall': [], 'f1': []}
    for _ in range(num_iter):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=rng
        )

        log_reg_model = LogisticRegression()
        log_reg_model.fit(X_train, y_train)
        y_pred = log_reg_model.predict(X_test)

        # average='binary' reports scores for pos_label only, so each value
        # appended below is a single float rather than a per-class array.
        p, r, f, _support = precision_recall_fscore_support(
            y_test, y_pred, pos_label=1, average='binary'
        )

        results['precision'].append(p)
        results['recall'].append(r)
        results['f1'].append(f)

    return results


## Values without any feature selection

In [None]:
# Baseline: evaluate logistic regression on every feature, then average each
# metric over the repeated splits.
cv_results = cross_validate(X, y)

mean_precision, mean_recall, mean_f1 = (
    sum(cv_results[metric]) / len(cv_results[metric])
    for metric in ('precision', 'recall', 'f1')
)

for label, value in (
    ("Mean Precision:", mean_precision),
    ("Mean Recall:", mean_recall),
    ("Mean F1-score:", mean_f1),
):
    print(label, value)

## Analyze The Features

In [None]:
# Fit a statsmodels logit on the features so that per-coefficient p-values
# are available for feature selection.
# NOTE(review): X is passed as-is, so no intercept column is added here -
# confirm that is intended.
logit_model = sm.Logit(y, X)
logit_result = logit_model.fit()

print("Coefficients:", logit_result.params, sep="\n")
print("Model summary:", logit_result.summary(), sep="\n")

In [None]:
# Keep only the coefficients whose p-value is above the 0.05 cut-off.
p_values = logit_result.pvalues
above_cutoff = p_values.gt(0.05)
insignificant_features = p_values.loc[above_cutoff]

print("Features with p-values > 0.05:", insignificant_features, sep="\n")

## Values Of Logistic Regression Without Features With p-values > .05

In [None]:
# Feature columns removed for this run (hard-coded; presumably the ones the
# p-value cell above reported as > 0.05).
insignificant_columns = ["SBP", "DBP", "ALT", "BUN", "CCR"]

# The target must be dropped as well: leaving 'Diabetes' among the features
# lets the model read the answer directly and yields meaningless perfect scores.
X = df.drop(columns=insignificant_columns + ["Diabetes"])
cv_results = cross_validate(X, y)

mean_precision = sum(cv_results['precision']) / len(cv_results['precision'])
mean_recall = sum(cv_results['recall']) / len(cv_results['recall'])
mean_f1 = sum(cv_results['f1']) / len(cv_results['f1'])

print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)
print("Mean F1-score:", mean_f1)

## Analyze Individual Features To See If The Perfect Scores Are Just A Fluke

In [None]:
# For each feature on its own: fit a one-variable logistic regression, report
# how well it recovers the target, and plot the feature against the target.
target_variable = "Diabetes"
target = df[target_variable]

for column in df.columns:
    # Fitting the target against itself is trivially perfect and tells us
    # nothing, so only genuine feature columns are examined.
    if column == target_variable:
        continue

    # Local names are used so the notebook-level X / y are not overwritten.
    single_feature = df[[column]]

    lm = LogisticRegression()
    lm.fit(single_feature, target)

    # Predictions are made on the same rows used for fitting.
    y_pred = lm.predict(single_feature)

    display(y_pred)

    display('predicted range: [{0:.2f},{1:.2f}]'.format(min(y_pred), max(y_pred)))

    # labels=[1, 0] orders the per-class arrays as (positive, negative).
    p, r, f, s = precision_recall_fscore_support(target, y_pred, labels=[1, 0])
    display('precision = {}'.format(p))
    display('recall = {}'.format(r))
    display('f-score = {}'.format(f))

    plt.figure(figsize=(8, 6))
    plt.scatter(df[column], target, alpha=0.5)
    plt.title(f'{column} vs. {target_variable}')
    plt.xlabel(column)
    plt.ylabel(target_variable)
    plt.show()

In [None]:
# Read the hotel booking dataset; the CSV is expected in the notebook's working directory.
hotel_csv = "hotel_bookings.csv"
df2 = pd.read_csv(hotel_csv)


## Make Some Analysis

In [None]:
# Print every column of the hotel frame, one Series at a time, for a first look.
for column in list(df2.columns):
    column_values = df2[column]
    print(column_values)

## Handle Null Values

In [None]:
# Share of missing values per column, in percent.
null_percentages = (df2.isnull().sum() / len(df2)) * 100
print("Percentage of null values in each column:")
print(null_percentages)

# Columns missing in more than this percentage of rows are removed entirely.
threshold = 10

columns_to_drop = null_percentages[null_percentages > threshold].index

print(columns_to_drop)

df2_cleaned = df2.drop(columns=columns_to_drop)

print("DataFrame after removing columns with high null percentages:")
print(df2_cleaned)

# Drop the remaining incomplete rows from the column-pruned frame. Calling
# dropna() on the unpruned df2 would throw away every row that is missing a
# value in one of the sparse columns removed above.
# NOTE(review): this keeps more rows than before, so downstream model fitting
# will take longer.
df2 = df2_cleaned.dropna()
print(df2)


## Remove reservation status date because it isn't categorical or numeric in nature

In [None]:
# Discard the date column before encoding and scaling.
df2 = df2.drop(columns=["reservation_status_date"])

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Collect the object-typed columns first, then replace each with integer codes.
# The encoder is re-fitted for every column, so the codes are per-column.
object_columns = [name for name in df2.columns if df2[name].dtype == 'object']
for column in object_columns:
    df2[column] = label_encoder.fit_transform(df2[column])

print("DataFrame after label encoding non-numeric columns:", df2, sep="\n")


In [None]:
# Z-score every column except the 'is_canceled' target: subtract the column
# mean and divide by the column's sample standard deviation.
for column in df2.columns:
    if column == 'is_canceled':
        continue
    centered = df2[column] - df2[column].mean()
    df2[column] = centered / df2[column].std()

In [None]:
# Re-check missing values after scaling and drop any column that is now more
# than `threshold` percent null.
null_percentages = (df2.isnull().sum() / len(df2)) * 100
print("Percentage of null values in each column:", null_percentages, sep="\n")

threshold = 10

exceeds_threshold = null_percentages > threshold
columns_to_drop = null_percentages.loc[exceeds_threshold].index

print(columns_to_drop)

df2 = df2.drop(columns=columns_to_drop)

## Perform Simple Analysis On Data

In [None]:
# Separate the cancellation target from the hotel feature matrix.
y = df2['is_canceled']
X = df2.drop(columns=['is_canceled'])

In [None]:
# Show the inputs, then evaluate logistic regression on every hotel feature
# and average each metric over the repeated splits.
print(X, y, sep="\n")
cv_results = cross_validate(X, y)

mean_precision, mean_recall, mean_f1 = (
    sum(cv_results[metric]) / len(cv_results[metric])
    for metric in ('precision', 'recall', 'f1')
)

for label, value in (
    ("Mean Precision:", mean_precision),
    ("Mean Recall:", mean_recall),
    ("Mean F1-score:", mean_f1),
):
    print(label, value)

## Perform Feature Selection On Data

In [None]:
# Fit a statsmodels logit on the hotel features so that per-coefficient
# p-values are available for feature selection.
# NOTE(review): X is passed as-is, so no intercept column is added here -
# confirm that is intended.
logit_model = sm.Logit(y, X)
logit_result = logit_model.fit()

print("Coefficients:", logit_result.params, sep="\n")
print("Model summary:", logit_result.summary(), sep="\n")

## Keep the 5 features with the smallest p-values

In [None]:
# Rank the coefficients by p-value (ascending) and keep the first five.
p_values = logit_result.pvalues
sorted_p_values = p_values.sort_values()
significant_features = sorted_p_values.iloc[:5]

print("Top 5 features with the smallest p-values:", significant_features, sep="\n")

# Restrict the feature matrix to those five columns.
significant_feature_names = list(significant_features.index)
X = df2[significant_feature_names]

print("DataFrame with only the top 5 significant features:", X, sep="\n")

In [None]:
# NOTE: this instance is not passed to cross_validate, which takes only X and y.
log_reg_model = LogisticRegression()

# Evaluate logistic regression on the selected features and average each
# metric over the repeated splits.
cv_results = cross_validate(X, y)

mean_precision, mean_recall, mean_f1 = (
    sum(cv_results[metric]) / len(cv_results[metric])
    for metric in ('precision', 'recall', 'f1')
)

for label, value in (
    ("Mean Precision:", mean_precision),
    ("Mean Recall:", mean_recall),
    ("Mean F1-score:", mean_f1),
):
    print(label, value)

## Try Out SVMs to see if that will help

In [None]:
def cross_validate_SVM(X, y, degree, num_iter=10, average='binary',
                       pos_label=1, random_state=None):
    """Score a polynomial-kernel SVM over repeated random hold-out splits.

    Each iteration draws a fresh random 80/20 train/test split, fits
    ``svm.SVC(kernel='poly', degree=degree)`` on the training part and
    scores its predictions on the held-out part.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` / ``svm.SVC``.
    y : target aligned with ``X``.
    degree : int
        Degree of the polynomial kernel.
    num_iter : int, default 10
        Number of independent random splits to evaluate.
    average : str or None, default 'binary'
        Forwarded to ``precision_recall_fscore_support``. The default
        reports one score for ``pos_label``, matching ``cross_validate``,
        so averaging the per-iteration results gives a single number.
        Pass ``None`` to get the previous per-class arrays instead.
    pos_label : default 1
        Class treated as positive when ``average='binary'``.
    random_state : int or None, default None
        Seed for the sequence of splits. ``None`` leaves the splits
        unseeded (results differ between runs).

    Returns
    -------
    dict
        ``{'precision': [...], 'recall': [...], 'f1': [...]}`` with one
        entry per iteration (floats for ``average='binary'``).
    """
    # One generator shared by all iterations: every round still gets a
    # different split, but the full sequence is repeatable for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    results = {'precision': [], 'recall': [], 'f1': []}
    for _ in range(num_iter):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=rng
        )

        clf = svm.SVC(kernel='poly', degree=degree)
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)
        p, r, f, _support = precision_recall_fscore_support(
            y_test, y_pred, pos_label=pos_label, average=average
        )

        results['precision'].append(p)
        results['recall'].append(r)
        results['f1'].append(f)

    return results


In [None]:
# Sweep the polynomial degree from 1 to 5, record the mean F1 for each, and
# plot the mean F1 against the degree.
degrees = range(1, 6)
f1_scores = []

for i in degrees:
    cv_results = cross_validate_SVM(X, y, i)

    fold_scores = cv_results['f1']
    mean_f1 = sum(fold_scores) / len(fold_scores)
    f1_scores.append(mean_f1)

    print("Degree", i)
    print("Mean F1-score:", mean_f1)

fig, ax = plt.subplots()
ax.plot(degrees, f1_scores, marker='o')
ax.set_title('Mean F1-score vs. Degree of SVM')
ax.set_xlabel('Degree of SVM')
ax.set_ylabel('Mean F1-score')
ax.grid(True)
plt.show()
