### Implementing an Algorithm

- Import the scikit-learn library for linear regression.
- Instantiate the model.

In [1]:
# Import the LinearRegression estimator class from scikit-learn.
from sklearn.linear_model import LinearRegression

# Instantiate the model with its default settings; it is not fitted in this cell.
model = LinearRegression()

### Training the Model in Practice

- Read in the CSV file for the dataset.
- Organize and structure the data in a DataFrame.
- Select X & y training subsets.
- Train the model.

In [2]:
# Import the pandas DataFrame class (aliased as DF) and the read_csv function.
from pandas import DataFrame as DF, read_csv

# Read in the training data csv file.
dataset = read_csv("Training the Model in Practice/no_null_encoded_titanic.csv")

# Structure the data for training the model: remove the columns that are not
# wanted as model inputs.
# NOTE(review): "alive" looks like a second encoding of the target column;
# confirm against the dataset before relying on that.
dataset = dataset.drop(columns=["Unnamed: 0.1", "Unnamed: 0", "alive"])

# Columns are referenced by name rather than by position, so the selections
# below keep working even if the column order changes.

# Create subsets of the dataset to train the model on. (X & y selection)
target_column_name = "survived"
y_training = dataset[target_column_name]
X_training = dataset.drop(columns=target_column_name)

# View the training subsets.
#display(X_training)
#display(y_training)

# Train the model on the subsets of data that were created.
# The trailing semicolon stops the notebook from echoing the estimator's repr
# (fit returns the estimator itself).
model.fit(X_training, y_training);

### Applying a Model

- Set up and train a classification model using the Titanic dataset.
- Set up and train a regression model using the cleaned house-price dataset (`kc_house_cleaned`).

In [3]:
# Import the Decision Tree and Linear Regression Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split as tts

# Import pandas functions.
from pandas import concat
import pandas as pd

# Set pandas options: show floats 10 characters wide, with thousands
# separators and two decimal places.
pd.options.display.float_format = '{:10,.2f}'.format

# Variables
# Fraction of each dataset's rows that is used for training.
# NOTE: train_test_split (imported above as `tts`) accepts this fraction
# directly through its train_size argument.
training_percentage = 0.75
# Fixed seed so that the decision tree is built identically on every run.
random_seed = 42

# Instantiate the models.
# The decision tree randomly permutes candidate features while searching for
# splits, so without a fixed random_state the fitted tree (and every score
# computed from it) can differ from one run to the next.
DT_Model = DecisionTreeClassifier(random_state=random_seed)
LR_Model = LinearRegression()

# Load each dataset and immediately discard the columns that are not wanted as
# model inputs. Columns are addressed by label rather than by position, so the
# drops stay correct even if the column order changes.
DT_dataset = read_csv("Applying a Model/no_null_encoded_titanic.csv").drop(
    columns=["Unnamed: 0.1", "Unnamed: 0", "alive"]
)
LR_dataset = read_csv("Applying a Model/kc_house_cleaned (2).csv").drop(
    columns=["Unnamed: 0"]
)

# Optional inspection of the cleaned frames.
#display(DT_dataset)
#display(LR_dataset)

# Name of the column that each model learns to predict.
DT_target_data_feature, LR_target_data_feature = "survived", "price"

# Slice datasets for training and validation.
# The split is positional and unshuffled: the first `training_percentage` of
# the rows (rounded to a whole row count) become the training set and the
# remaining rows become the test set.
# NOTE(review): train_test_split (imported above as `tts`) would do this in one
# call, but it shuffles by default and sizes the parts slightly differently, so
# it is not a drop-in replacement for the slices below.
DT_pivot = round(training_percentage * len(DT_dataset))
LR_pivot = round(training_percentage * len(LR_dataset))

# An open-ended slice (`[:pivot]`, `[pivot:]`) runs from the start / to the end
# of the frame, so no explicit 0 or len(...) bound is needed.
DT_train, DT_test = DT_dataset.iloc[:DT_pivot], DT_dataset.iloc[DT_pivot:]
LR_train, LR_test = LR_dataset.iloc[:LR_pivot], LR_dataset.iloc[LR_pivot:]

# Build subsets for validation.
# The known answers are kept as a one-column DataFrame named "Known Answers"
# with a fresh 0..n-1 index. Resetting the index is what lets them line up
# row-for-row when they are later placed beside a newly built predictions
# frame; the price is that the labels no longer carry the original row index
# of their input rows.
X_DT_Testing = DT_test.drop(columns=DT_target_data_feature)
y_DT_Testing = (
    DT_test[[DT_target_data_feature]]
    .rename(columns={DT_target_data_feature: "Known Answers"})
    .reset_index(drop=True)
)

X_LR_Testing = LR_test.drop(columns=LR_target_data_feature)
y_LR_Testing = (
    LR_test[[LR_target_data_feature]]
    .rename(columns={LR_target_data_feature: "Known Answers"})
    .reset_index(drop=True)
)

# Build subsets for training: every column except the target is an input, and
# the target column on its own is the label.
X_DT_Training, y_DT_Training = (
    DT_train.drop(columns=DT_target_data_feature),
    DT_train[DT_target_data_feature],
)
X_LR_Training, y_LR_Training = (
    LR_train.drop(columns=LR_target_data_feature),
    LR_train[LR_target_data_feature],
)

# Fit models with the training subsets.
DT_Model.fit(X=X_DT_Training, y=y_DT_Training)
LR_Model.fit(X=X_LR_Training, y=y_LR_Training)

# Test the models with the testing subsets and place each model's predictions
# beside the known answers. Both frames carry a default 0..n-1 index (the
# known answers had theirs reset), so the column-wise concat pairs them up
# row-for-row.
DT_result = pd.concat(
    [DF(DT_Model.predict(X_DT_Testing), columns=["Predictions"]), y_DT_Testing],
    axis=1,
)
LR_result = pd.concat(
    [DF(LR_Model.predict(X_LR_Testing), columns=["Predictions"]), y_LR_Testing],
    axis=1,
)

# Display results.
#display(DT_result)
#display(LR_result)




### Testing Accuracy in Practice

- Test the classification model and calculate its accuracy as a percentage.
- Test the regression model and calculate its R² score (coefficient of determination, not an accuracy) as a percentage.

In [5]:
# Replace the side-by-side result frames from the previous exercise with the
# raw prediction arrays.
DT_result, LR_result = DT_Model.predict(X_DT_Testing), LR_Model.predict(X_LR_Testing)

# Gather scores on the held-out rows.
# For the classifier, .score is the mean accuracy. For the regressor, .score is
# R^2 (the coefficient of determination), which is not an accuracy and can be
# negative; the LR_* names are kept as they are for later cells.
DT_acc = DT_Model.score(X_DT_Testing, y_DT_Testing)
LR_acc = LR_Model.score(X_LR_Testing, y_LR_Testing)

# Express both scores as percentages.
DT_per, LR_per = 100 * DT_acc, 100 * LR_acc

# Display scores and percentages.

# display("Decision Tree Scores:")
# display(f"Accuracy Score: {DT_acc:.5f}")
# display(f"Accuracy Percentage: {DT_per:.2f}%")
# display("\n")
# display("Linear Regression Scores:")
# display(f"Accuracy Score: {LR_acc:.5f}")
# display(f"Accuracy Percentage: {LR_per:.2f}%")


### Applying a Recall Test

- Import the recall-score function.
- Implement the recall_score function for Decision Tree Model.

In [11]:
from sklearn.metrics import recall_score

# Recall of the decision tree on the held-out rows. With average=None,
# recall_score returns one value per class label instead of a single
# averaged number.
DT_Rscore = recall_score(
    y_true=y_DT_Testing,
    y_pred=DT_Model.predict(X_DT_Testing),
    average=None,
)
# Recall is a classification metric, so it is not computed for the
# linear regression model.

display(DT_Rscore)


array([0.77777778, 0.71428571])