### Implementing an Algorithm

- Import the scikit-learn library for linear regression.
- Instantiate the model.

In [100]:
# Import the LinearRegression estimator from scikit-learn's linear_model module.
from sklearn.linear_model import LinearRegression

# Instantiate the model. No arguments are passed, so the estimator's default
# parameters are used; the model is not trained here (fit is called later).
model = LinearRegression()

### Training the Model in Practice

- Read in the CSV file for the dataset.
- Organize and structure the data in a DataFrame.
- Select X & y training subsets.
- Train the model.

In [101]:
# Bring in pandas' CSV reader, plus DataFrame under the short alias DF.
from pandas import DataFrame as DF, read_csv

# Load the training data from its CSV file.
dataset = read_csv("Training the Model in Practice/no_null_encoded_titanic.csv")

# Tidy the data: discard the first two columns (selected by position) and the
# "alive" column before training.
dataset = dataset.drop(list(dataset.columns[[0, 1]]) + ["alive"], axis="columns")

# X & y selection: the target column becomes y, every remaining column becomes X.
target_column_name = "survived"
y_training = dataset.loc[:, target_column_name]
X_training = dataset.drop(target_column_name, axis="columns")

# Uncomment to inspect the training subsets.
#display(X_training)
#display(y_training)

# Fit the model on the subsets built above. Leaving the call as the last
# expression lets the notebook render the fitted estimator as the cell output.
model.fit(X=X_training, y=y_training)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


### Applying a Model

- Set up and train a classification model using the Titanic dataset.
- Set up and train a regression model using the cleaned house dataset (`kc_house_cleaned`).

In [129]:
# Import the Decision Tree and Linear Regression Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression

# Import pandas functions. read_csv and DF are imported here as well so this
# cell runs on a fresh kernel without relying on names imported by another cell.
from pandas import DataFrame as DF, concat, read_csv
import pandas as pd

# Set pandas options: render floats with thousands separators and two decimals.
pd.options.display.float_format = '{:10,.2f}'.format

# Variables
training_percentage = 0.75  # fraction of rows, taken from the top, used for training
random_seed = 42            # fixed seed so the decision tree gives the same result on every run
DT_target_data_feature = "survived"
LR_target_data_feature = "price"


def _split_rows(frame, train_fraction):
    """Split `frame` by row position into a leading training part and a trailing test part.

    Parameters:
        frame: DataFrame to split.
        train_fraction: fraction of the rows that goes into the training part.

    Returns:
        (pivot, train, test), where `pivot` is the number of training rows,
        `train` holds rows [0, pivot) and `test` holds rows [pivot, end).

    Rows are selected positionally (iloc), so the split is correct even when
    the index labels are not unique. The rows are not shuffled.
    """
    pivot = round(len(frame.index) * train_fraction)
    # .copy() keeps the two parts independent of `frame`.
    return pivot, frame.iloc[:pivot].copy(), frame.iloc[pivot:].copy()


def _known_answers(test_frame, target):
    """Return the `target` column of `test_frame` as a one-column "Known Answers" frame.

    The index is reset to 0..n-1 so the rows line up with the predictions
    frame when the two are concatenated side by side.
    """
    return test_frame[target].to_frame(name="Known Answers").reset_index(drop=True)


def _predict_with_answers(fitted_model, X_test, known_answers):
    """Predict on `X_test` and place the predictions next to the known answers.

    Returns a DataFrame with the columns "Predictions" and "Known Answers".
    """
    predictions = DF(fitted_model.predict(X_test), columns=["Predictions"])
    return concat([predictions, known_answers], axis=1)


# Instantiate the models. The seed pins the tree's internal randomness.
DT_Model = DecisionTreeClassifier(random_state=random_seed)
LR_Model = LinearRegression()

# Read in the Datasets
DT_dataset = read_csv("Applying a Model/no_null_encoded_titanic.csv")
LR_dataset = read_csv("Applying a Model/kc_house_cleaned (2).csv")

# Display the datasets
#display(DT_dataset)
#display(LR_dataset)

# Structure and organize the datasets: columns are dropped by position
# (first two for Titanic, first one for the house data), plus "alive".
DT_dataset = DT_dataset.drop(columns=[DT_dataset.columns[0], DT_dataset.columns[1], "alive"])
LR_dataset = LR_dataset.drop(columns=[LR_dataset.columns[0]])

# Slice datasets for training and validation.
# NOTE(review): rows are split in file order without shuffling; if either CSV
# is sorted, the training and testing parts may differ systematically.
DT_pivot, DT_train, DT_test = _split_rows(DT_dataset, training_percentage)
LR_pivot, LR_train, LR_test = _split_rows(LR_dataset, training_percentage)

# Build subsets for validation.
X_DT_Testing = DT_test.drop(columns=DT_target_data_feature)
y_DT_Testing = _known_answers(DT_test, DT_target_data_feature)

X_LR_Testing = LR_test.drop(columns=LR_target_data_feature)
y_LR_Testing = _known_answers(LR_test, LR_target_data_feature)

# Build subsets for training.
X_DT_Training = DT_train.drop(columns=DT_target_data_feature)
y_DT_Training = DT_train[DT_target_data_feature]

X_LR_Training = LR_train.drop(columns=LR_target_data_feature)
y_LR_Training = LR_train[LR_target_data_feature]

# Fit models with the training subsets.
DT_Model.fit(X_DT_Training, y_DT_Training)
LR_Model.fit(X_LR_Training, y_LR_Training)

# Test models with the testing subsets and super-impose the known answers.
DT_result = _predict_with_answers(DT_Model, X_DT_Testing, y_DT_Testing)
LR_result = _predict_with_answers(LR_Model, X_LR_Testing, y_LR_Testing)

# Display results.
display(DT_result)
display(LR_result)


Unnamed: 0,Predictions,Known Answers
0,0,0
1,0,1
2,1,1
3,0,0
4,0,0
...,...,...
173,0,0
174,0,0
175,1,1
176,0,1


Unnamed: 0,Predictions,Known Answers
0,952059.24,714000.00
1,1146416.87,970000.00
2,17899.19,280000.00
3,263739.22,260000.00
4,437961.08,294000.00
...,...,...
4684,817967.23,400000.00
4685,458480.94,529941.00
4686,223834.19,310000.00
4687,223834.19,389000.00
