#!/usr/bin/env python

__author__ = "Sushil Sivaram, Megha Gubbala", "Sylvia Nanyangwe"
__copyright__ = "N/A"
__credits__ = ["Isac Artzi", "Dinesh Sthapit", "Ken Ferrell", "James Dzikunu", "Tracy Roth", "Renee Morales"]
__license__ = "ECL"
__maintainer__ = "Sushil Sivaram, Megha Gubbala", "Sylvia Nanyangwe"
__email__ = "SushilSivaram@gmail.com"
__status__ = "Development"

# In [37]:

from scipy import stats
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from yellowbrick.features import Rank2D
from yellowbrick.target import ClassBalance
from yellowbrick.classifier import ConfusionMatrix
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import scikitplot as skplt
import warnings
warnings.filterwarnings('ignore')

# In [None]:
# Define Variables
# Settings are read from the process environment after load_dotenv() has run.
# Indexing os.environ (rather than os.getenv) makes a missing setting fail
# immediately with a KeyError naming the variable, instead of a later
# TypeError from int(None)/float(None) or a None path reaching pd.read_csv.
load_dotenv()
# The 'CSVData' environment variable is an alternative input that is currently disabled.
CSVData = os.environ['CSVOriginal']
DependentVariable = os.environ['DependentVariable']
head_Value = int(os.environ['Head_Value'])
testSize = float(os.environ['test_size'])
randomstate = int(os.environ['random_state'])


# In [None]:

'''
Setup Reusable Functions
'''

# Load Data from CSV
def loadAndExtractData():
    """Read the configured CSV and publish raw and cleaned frames as globals.

    Side effects:
        * ``datasetupUnprocessed`` is bound to the frame exactly as read from
          the path held in ``CSVData``.
        * ``dataSetUp`` is bound to the cleaned frame: restricted to the four
          modelling columns, with rows removed when any of those columns is
          missing or when any column's absolute z-score is 3 or more.
        * The cleaned frame is written to ``cleaned.csv`` without its index.
    """
    global dataSetUp, datasetupUnprocessed

    keepcolumns = ['cost_yr', 'median_income', 'affordability_ratio', 'ave_fam_size']

    datasetupUnprocessed = pd.read_csv(CSVData)
    complete = datasetupUnprocessed.filter(keepcolumns).dropna(subset=keepcolumns)
    # A row survives only when every column is strictly within 3 standard deviations.
    withinThreeSigma = (np.abs(stats.zscore(complete)) < 3).all(axis=1)
    dataSetUp = complete[withinThreeSigma]
    dataSetUp.to_csv('cleaned.csv', index=False)

#print Info
def showDataHeadAndInfo(headCount):
    """Print a preview and summary statistics of the raw (unprocessed) data.

    Args:
        headCount: number of leading rows of ``datasetupUnprocessed`` to print.

    The report goes to stdout: the first ``headCount`` rows, a separator,
    then ``describe(include='all')`` of the same frame.
    """
    raw = datasetupUnprocessed

    print(f"showing head {headCount} values")
    print(raw.head(headCount))

    print("**********")

    print("Showing info of dataset")
    print(raw.describe(include='all'))

# preProcessing
def preProcessing():
    """Turn the dependent column of ``dataSetUp`` into an integer class label.

    Steps, all applied in place to the global ``dataSetUp``:
        1. Bin the column named by ``DependentVariable`` with edges
           (0, .2, 3), labelling the lower bin 'Cant Afford' and the upper
           bin 'Can Afford'.
        2. Write the binned frame to ``test.csv``.
        3. Replace the text labels with integer codes from ``LabelEncoder``.
        4. Print the raw-data report and the count of each encoded class.

    NOTE(review): LabelEncoder presumably numbers classes in sorted label
    order, which would make 'Can Afford' -> 0 and 'Cant Afford' -> 1;
    confirm this against the code that interprets the predictions.
    """
    target = DependentVariable

    dataSetUp[target] = pd.cut(
        dataSetUp[target],
        (0, .2, 3),
        labels=['Cant Afford', 'Can Afford'],
    )
    dataSetUp.to_csv('test.csv')

    encoder = LabelEncoder()
    dataSetUp[target] = encoder.fit_transform(dataSetUp[target])

    showDataHeadAndInfo(head_Value)
    class_counts = dataSetUp[target].value_counts()
    print(class_counts)

#plotting
def _saveHistogram(series, title, xlabel, path):
    """Draw a 25-bin histogram of *series* on its own figure and save it to *path*."""
    fig, ax = plt.subplots()
    series.plot.hist(ax=ax, bins=25, grid=True, rwidth=0.9, color='#607c8e')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.grid(axis='y', alpha=0.5)
    fig.savefig(path)


def _saveScatter(frame, xColumn, title, xlabel, path):
    """Scatter *xColumn* against 'cost_yr' on its own figure and save it to *path*."""
    fig, ax = plt.subplots()
    # Passing ax explicitly keeps the plot on the figure created here rather
    # than relying on which figure pandas decides to draw on.
    frame.plot.scatter(ax=ax, c='DarkBlue', x=xColumn, y='cost_yr')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('cost_yr')
    ax.grid(axis='y', alpha=0.5)
    fig.savefig(path)


def plotting(dataSetUp, state, outputDir='outputs'):
    """Save and show exploratory histograms and scatter plots of a data frame.

    Args:
        dataSetUp: frame with 'median_income', 'ave_fam_size' and 'cost_yr'
            columns.
        state: tag appended to every plot title and output file name.
        outputDir: directory receiving the PNG files; created when missing.

    Five PNG files are written (three histograms, two scatter plots) and
    all figures are then displayed with ``plt.show()``.
    """
    # The previous hard-coded '.\outputs\...' strings only worked on Windows
    # and relied on invalid escape sequences; build portable paths instead.
    os.makedirs(outputDir, exist_ok=True)

    def target(stem):
        return os.path.join(outputDir, f'{stem}{state}.png')

    _saveHistogram(dataSetUp['median_income'],
                   f'Histogram of Median Income {state}',
                   'Median Income in $',
                   target('histMedianIncome'))
    _saveHistogram(dataSetUp['ave_fam_size'],
                   f'Histogram of Family Size {state}',
                   'Family Size',
                   target('histavgFamsize'))
    _saveHistogram(dataSetUp['cost_yr'],
                   f'Histogram of Yearly Cost of Food $ {state}',
                   'Yearly Cost $',
                   target('histCost'))

    _saveScatter(dataSetUp, 'median_income',
                 f'scatterogram of Median Income vs Expenditure {state}',
                 'Median Income in $',
                 target('scatterMedianIncomeVSExpenditure'))
    _saveScatter(dataSetUp, 'ave_fam_size',
                 f'scatterogram of Family Size vs Expenditure {state}',
                 'Family Size',
                 target('scatterFamSizeVSExpenditure'))
    plt.show()




# In [None]:

# Load Data from CSV
# Populates the globals datasetupUnprocessed (raw) and dataSetUp (cleaned).
loadAndExtractData()


# In [None]:

# Print Info
showDataHeadAndInfo(head_Value)


# In [None]:

# Exploratory plotting
plotting(datasetupUnprocessed , "BeforeProcessing")


# In [None]:

# preProcessing
# Replaces the dependent column of dataSetUp with encoded class labels.
preProcessing()


# In [None]:
# Same plots again, now on the cleaned and encoded frame.
plotting(dataSetUp, "PostProcessing")

# In [None]:

# Separate dependent and independent variables:
# X holds every column except the dependent one, y holds the dependent column.
X = dataSetUp.drop(columns=DependentVariable)
y = dataSetUp[DependentVariable]

# In [None]:
#yellow brick
# Pearson-based Rank2D view of the feature columns.
visualizer = Rank2D(algorithm='pearson')
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.show()
# Class-balance chart of the encoded target.
# NOTE(review): these display labels are attached to the encoded classes;
# confirm their order matches the encoding produced in preProcessing().
visualizer = ClassBalance(labels=["Cant Afford", "Can Afford"])
visualizer.fit(y)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure

# In [None]:

# Train and test with random seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testSize, random_state=randomstate)


# In [None]:

# Optimizing with standardScaler to minimize bias and normalize values
# The scaler is fitted on the training split only and then applied to the
# test split, so test data does not influence the scaling parameters.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# In [None]:
# Candidate classifiers, keyed by a short name, compared in the evaluation loop.
dict_classifiers = {
    # Seeded with the configured random_state so repeated runs give the same
    # forest, matching the seeded train/test split.
    "rfc": RandomForestClassifier(n_estimators=200, random_state=randomstate),
    "clf": svm.SVC(),
    "mlpc": MLPClassifier(hidden_layer_sizes=(11, 11, 11), max_iter=500,  random_state=1)
}

# In [None]:

# Fit every candidate classifier on the scaled training split and report its
# confusion matrix, classification report and accuracy on the test split.
for model in dict_classifiers.values():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # yellow brick
    cm = ConfusionMatrix(model, classes=[0, 1])
    cm.fit(X_train, y_train)
    cm.score(X_test, y_test)
    cm.show()

    confusion_Matrix = confusion_matrix(y_test, y_pred)
    # Previously computed into `cm` (clobbering the visualizer) and never shown.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Printing Model details for : {model}\n"
          f"Printing Confusion Matrix\n{confusion_Matrix}\n"
          f"Printing Classification Report\n {classification_report(y_test, y_pred)}\n"
          f"Printing Accuracy\n{accuracy}\n"
          f"****\n"
          f"End of Model\n"
          f"****\n")


# In [None]:
# Dedicated MLP, seeded with the configured random_state; reused by the
# cost-search cell further down.
mlpc = MLPClassifier(hidden_layer_sizes=(11, 11, 11), max_iter=500,  random_state=randomstate)
mlpc.fit(X_train, y_train)

# In [None]:
# Classify the first (already scaled) test row; the double brackets keep it 2-D.
Xnew = X_test[[0]]
pred_mlpc = mlpc.predict(X_test)
ynew = mlpc.predict(Xnew)
if ynew[0] == 0:
    print("I am too poor to afford food")
else:
    print("I will survive")


# In [None]:

# ROC curves for a Gaussian Naive Bayes model trained on a fresh 67/33 split.
# NOTE: this rebinds X_train/X_test/y_train/y_test to an unscaled split; the
# scaler `sc` and the model `mlpc` fitted earlier are left untouched.
# The split is seeded with the configured random_state so the curve is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=randomstate)
nb = GaussianNB()
nb.fit(X_train, y_train)
predicted_probas = nb.predict_proba(X_test)
skplt.metrics.plot_roc(y_test, predicted_probas)
plt.show()


# In [None]:

# Starting from a yearly cost of 8000, step the cost down until the trained
# MLP (`mlpc`) predicts class 1 for the fixed income and family size.
# Each candidate row is laid out as [cost, income, family size].
Xnew = [[8000, 0, 0]]
ynew = 0
income = 30000
famSize = 3
reduceCostIteratorValue = 50

cost = Xnew[0][0]
foundAffordableCost = False
# Stop once the cost is no longer positive: without this bound the search
# would run forever whenever the model never predicts class 1.
while cost > 0:
    Xnew = [[cost, income, famSize]]
    # Apply the scaler fitted on the training split before predicting.
    Xnew1 = sc.transform(Xnew)
    ynew = mlpc.predict(Xnew1)
    print(f'new ynew: {ynew} for xnew: {Xnew}')
    if ynew[0] == 1:
        print(f"VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV You will Be fine at current cost below "
              f"VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV")
        foundAffordableCost = True
        break
    print(f"HMM you are spending too much iterating to find you an optimal amount trying {Xnew} ")
    cost -= reduceCostIteratorValue

if foundAffordableCost:
    print(f"***********************************************-- "
          f"We suggest you reduce your annual expenditure to ${Xnew[0][0]} for family counts of {Xnew[0][2]} and income of ${Xnew[0][1]}"
          f" --***********************************************")
else:
    # Only claim a suggestion when the model actually accepted a cost.
    print(f"***********************************************-- "
          f"No affordable annual expenditure above $0 was found for family counts of {famSize} and income of ${income}"
          f" --***********************************************")

