In [1]:
import os
import pandas as pd
import json
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR,LinearSVR
from sklearn.preprocessing import Normalizer
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
import matplotlib

In [2]:
#these are the columns from the csv we will be reading
COLS = ['Species', 'Weight', 'Length1', 'Length2', 'Length3', 'Height', 'Width']
#this is the column with the value we are trying to predict
Y_COL = ['Weight']
X_COLS = []
unique_fish = []

In [3]:
# csv_path = r"fish_participant.csv"
# validation_data_path=r"fish_holdout_demo.csv"
csv_path = os.path.join("fish_participant.csv")
validation_data_path = os.path.join("fish_holdout_demo.csv")

In [4]:
def PickBestModelAndProcessor(csv_path, validation_data_path, models=[], processors=[]):
    train_data = BuildDataFrame(csv_path)
    test_data = BuildDataFrame(validation_data_path)
    print("the top 5 rows look like this: ")
    print(train_data.head())
    y_train = train_data.drop(columns=[c for c in COLS if c not in Y_COL], inplace=False)
    x_train_possible = train_data.drop(columns=Y_COL, inplace=False)
    y_test = test_data.drop(columns=[c for c in COLS if c not in Y_COL], inplace=False)
    x_test_possible = test_data.drop(columns=Y_COL, inplace=False)
    MAX_FEATURES_COUNT=x_test_possible.shape[1]
    results_list = []
    best_score = 0
    best_processor = None
    best_model = None
    best_feature_count=None
    for n in range(1,MAX_FEATURES_COUNT+1):
        for i in range(len(processors)):
            print("preprocssing data with preprocessor number:",str(i))
            X_train,X_test = processors[i](x_train_possible, y_train, x_test_possible,n)
            for j in range(len(models)):
                print("training model number "+str(j))
                models[j].fit(X_train, y_train)
                print("done training, we will see our accuracy rate now:")
                results=models[j].score(X_test,y_test)   ##this is ultimately what is needed in the homework is a model pass a X_test and y_test data to
                print("the average deviation from the correct answer is: "+str(results))
                results_list.append(results)
                if results>best_score and results <= 1.0 : #the best result possible is 1.0
                    best_model=models[j]
                    best_processor=processors[i]
                    best_score = results
                    best_features=X_test.columns.to_list()
    return {"best_score":best_score,"best_model":best_model,"best_processor":best_processor,"top_n_features":best_features,"all_results":results_list}

In [5]:
#no data processor
def preprocess0(x,y,x2,n):
    print("no preprocessing will happen in this function")
    return x,x2

In [6]:
#select top n features
def preprocess1(x,y,x2,n):
    print("running processor 1 which will select top n features AND scale the feature values")
    anova_filter = SelectKBest(f_regression, k=n)
    anova_filter.fit(x, y)
    cols = anova_filter.get_support(indices=True)
    print("out of a total of 6 features, the top 5 at predicting the weight are:")
    print(anova_filter.get_support())
    X_train=x.iloc[:,cols]
    X_test=x2.iloc[:,cols]
    print("new training set: ")
    print(X_train.head())
    return  X_train,X_test

In [7]:
##select top n features AND scale the feature values
def preprocess2(x,y,x2,n):
    print("running processor 1 which will select top 5 features AND scale the feature values")
    anova_filter = SelectKBest(f_regression, k=n)
    anova_filter.fit(x, y)
    cols = anova_filter.get_support(indices=True)
    print("out of a total of 6 features, the top 5 at predicting the weight are:")
    print(anova_filter.get_support())
    X_train = x.iloc[:, cols]
    X_test = x2.iloc[:, cols]
    print("new training set: ")
    print(X_train.head())
    scaler = StandardScaler()
    scaler.fit(X_train,y)
    X_train = pd.DataFrame(scaler.transform(X_train), columns= X_train.columns)
    X_test= pd.DataFrame(scaler.transform(X_test), columns= X_test.columns)
    print("New X head values after scaling:")
    print(X_train.head())
    return X_train,X_test

In [8]:
#select top n features AND scale the feature values AND normalize the vector rows
def preprocess3(x,y,x2,n):
    print("running processor 1 which will select top 5 features AND scale the feature values AND normalize the vector rows")
    print("running processor 1 which will select top 5 features AND scale the feature values")
    anova_filter = SelectKBest(f_regression, k=n)
    anova_filter.fit(x, y)
    cols = anova_filter.get_support(indices=True)
    print("out of a total of 6 features, the top 5 at predicting the weight are:")
    print(anova_filter.get_support())
    X_train = x.iloc[:, cols]
    X_test = x2.iloc[:, cols]
    print("new training set: ")
    print(X_train.head())
    scaler = StandardScaler()
    scaler.fit(X_train,y)
    X_train = pd.DataFrame(scaler.transform(X_train), columns= X_train.columns)
    X_test= pd.DataFrame(scaler.transform(X_test), columns= X_test.columns)
    print("New X head values after scaling:")
    print(X_train.head())
    normalize = Normalizer()
    normalize.fit(X_train,y)
    X_train=pd.DataFrame(normalize.fit_transform(X_train, y), columns = X_train.columns)
    X_test=pd.DataFrame(normalize.fit_transform(X_test, y), columns = X_test.columns)
    print("normalize vector columns")
    return X_train,X_test

In [9]:
def MapFishName(n): 
    global unique_fish
    if n not in unique_fish:
        unique_fish.append(n)
    return unique_fish.index(n)

In [10]:
def BuildDataFrame(path):
    with open(path, "r", encoding="utf8", errors="ignore") as f:
        return pd.read_csv(f, header='infer', converters={'Species': MapFishName})

In [11]:
def DetermineBestModelAndProcessing():
    processors=[preprocess0,preprocess1,preprocess2,preprocess3]
    model1 = KNeighborsRegressor()
    model2 = RandomForestRegressor()
    model3 = RandomForestRegressor()
    model4 = SVR()
    model5 = LinearSVR()
    models = [model1, model2, model3, model4, model5]
    results=PickBestModelAndProcessor(csv_path, validation_data_path, models=models,processors=processors)
    print(results)

In [12]:
#  DetermineBestModelAndProcessing()
#  Running a series of testes determines that of the models above
#testing revealed the following best methods:
#Best possible score is 1.00, we got 0.99506
#{'best_score': 0.9950688891121598, 'best_model': RandomForestRegressor, 'best_processor':  preprocess0
#we can also graph our values to see which features are most tightly bound to the y value, weight
#for x in ['Species', 'Length1', 'Length2', 'Length3', 'Height', 'Width']:
#     train_data.plot.scatter(x=x,y=['Weight'])
#so lets build our final model and make a method to run questions against it
def BuildAndTrainModel(csv_path,choice_model,DROP_FEATURES=[]):
    SELECTED_X_FEATURES=['Length1', 'Length2', 'Length3', 'Height', 'Width']
    Y_COL = ['Weight']
    all_data=BuildDataFrame(csv_path)
    y = all_data.drop(columns=[c for c in COLS if c not in Y_COL], inplace=False)
    X = all_data.drop(columns=Y_COL+DROP_FEATURES, inplace=False)
    choice_model.fit(X,y)
    return choice_model

In [13]:
DROP_FEATURE=['Species']
mymodel=RandomForestRegressor(n_estimators=115,criterion='mse')
trained_model=BuildAndTrainModel(csv_path,mymodel,DROP_FEATURE)  
validation_data=BuildDataFrame(validation_data_path)
y_test=validation_data.drop(columns=[c for c in COLS if c not in Y_COL], inplace=False)
X_test = validation_data.drop(columns=Y_COL+DROP_FEATURE, inplace=False)
print(trained_model.score(X_test,y_test))

  choice_model.fit(X,y)


0.9938525164572038


In [14]:
def PredictASingleX(model,X):#we use this to predict a single weight based on a single input of x features.  X must in format of [[a,b,c,d]] or as a dataframe
    print(model.predict(X))

In [15]:
# mean_squared_error([y_true], [y_pred])
def ScoreBasedOnMeanSquareError(model,X,y_true):  #returns score based on mean square error calculation
    prediction=model.predict(X)
    mse=mean_squared_error(y_true,prediction)
    print(mse)
    return mse

In [16]:
reshape = lambda x: x.ravel().reshape(1,-1)
scores1=[]
for i in range(X_test.shape[0]):
    result = ScoreBasedOnMeanSquareError(trained_model, X_test,y_test)
    scores1.append(result)


574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
574.0125655144478
