# MY LAB
***


In [112]:
import numpy as np
import matplotlib.pyplot as pltb
import pandas as pd

#### IMPORT DATA
***

In [113]:
# Read the mtcars dataset; the CSV's unlabeled first column (which pandas
# calls "Unnamed: 0") holds the car names, so give it a proper label.
dfcars = pd.read_csv("mtcars.csv").rename(columns={"Unnamed: 0": "name"})
dfcars.head()

Unnamed: 0,name,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


#### CLEAN DATA
***

In [114]:
# mpg is the response variable we want to predict, so pull it out on its own.
y = dfcars.loc[:, "mpg"]

In [115]:
# Every remaining column is a candidate predictor. Set aside everything except
# the car name (an identifier, not a feature) and mpg (the response).
# Dropping by label rather than by position keeps this correct even if the
# column order of the CSV ever changes.
allX = dfcars.drop(columns=["name", "mpg"])

In [116]:
# Preview the first five rows of the candidate predictors.
allX.head(n=5)

Unnamed: 0,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,8,360.0,175,3.15,3.44,17.02,0,0,3,2


#### SEPARATE THE DATA
***

In [117]:
#separate the data into a training set and a test set. We can use train_test_split for this
from sklearn.model_selection import train_test_split

In [118]:
# Hold out 20% of the rows as a test set; a fixed random_state makes the
# split reproducible from run to run.
allX_train, allX_test, y_train, y_test = train_test_split(
    allX,
    y,
    test_size=0.2,
    random_state=42,
)

#### VALIDATION
***

Cross-validation is a model evaluation technique that helps you measure how well your machine learning model generalizes to unseen data.

Instead of training and testing the model once, cross-validation splits your training data into multiple parts (folds), 
and the model is trained and tested multiple times — each time on a different subset of the data.

In [119]:
from sklearn.linear_model import LinearRegression

In [120]:
from sklearn.model_selection import cross_val_score

In [121]:
# First candidate: a single predictor, "wt" (used by model5; mean CV R² is about 0.57).
# Apply the same column selection to both the training and the test split.
X_train, X_test = (part[["wt"]] for part in (allX_train, allX_test))

In [122]:
# Estimate how well a linear regression on the selected features generalizes
# by cross-validating on the training split (X_train features, y_train target).
#
# cross_val_score partitions the training data into k folds, fits the model on
# k-1 of them, scores it on the fold that was left out, and repeats until every
# fold has been held out once. cv=5 is the library default, written out here
# for clarity.
#
# For regression the default score is R² (how well the model fits), so the
# result is an array with one R² value per fold.
model5 = LinearRegression()
scores5 = cross_val_score(estimator=model5, X=X_train, y=y_train, cv=5)
scores5

array([0.10736385, 0.49488149, 0.86879651, 0.65179584, 0.73272638])

In [123]:
# Average R² across the five folds for model5.
np.mean(scores5)

0.5711128154689782

In [124]:
# Second candidate: two predictors, "wt" and "hp" (used by model6; mean CV R² is about 0.63).
# Apply the same column selection to both the training and the test split.
X_train, X_test = (part[["wt", "hp"]] for part in (allX_train, allX_test))

In [125]:
# Cross-validate the current feature selection on the training split;
# cv=5 is the library default, written out for clarity. One R² per fold.
model6 = LinearRegression()
scores6 = cross_val_score(estimator=model6, X=X_train, y=y_train, cv=5)
scores6

array([0.1262545 , 0.6744817 , 0.76712479, 0.76448681, 0.83404541])

In [126]:
# Average R² across the five folds for model6.
np.mean(scores6)

0.6332786406507417

In [127]:
# Third candidate: three predictors, "wt", "hp" and "drat" (used by model7; mean CV R² is about 0.65).
# Apply the same column selection to both the training and the test split.
X_train, X_test = (part[["wt", "hp", "drat"]] for part in (allX_train, allX_test))

In [128]:
# Cross-validate the current feature selection on the training split
# (cv=5 is the library default, written out for clarity) and show the mean R².
model7 = LinearRegression()
scores7 = cross_val_score(estimator=model7, X=X_train, y=y_train, cv=5)
scores7.mean()

0.6571279909053493

In [129]:
# Fourth candidate: still three predictors, but with "carb" in place of "drat"
# (used by model8; mean CV R² is about 0.60).
# Apply the same column selection to both the training and the test split.
X_train, X_test = (part[["wt", "hp", "carb"]] for part in (allX_train, allX_test))

In [130]:
# Cross-validate the current feature selection on the training split
# (cv=5 is the library default, written out for clarity) and show the mean R².
model8 = LinearRegression()
scores8 = cross_val_score(estimator=model8, X=X_train, y=y_train, cv=5)
scores8.mean()

0.6036599344425267

In [131]:
# model7 (wt, hp, drat) had the best mean cross-validation R² of the candidates
# tried (about 0.66), so refit it on the full training split and score it on
# the held-out test set. The resulting test R² is about 0.79.
X_train, X_test = (part[["wt", "hp", "drat"]] for part in (allX_train, allX_test))
model7 = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator
model7.score(X_test, y_test)

0.7900492843805198

In [132]:
# Goal is to train a Linear Regression model to predict a target (y_train, y_test) using a subset of features ("wt", "hp", and "drat"), 
# and then evaluating how well it performs on test data.

# X_train = allX_train[["wt","hp","drat"]]: You're selecting only 3 specific columns ("wt", "hp", "drat") 
# from your full training data allX_train. X_train will now contain only these three features (variables) for training.

# X_test = allX_test[["wt","hp","drat"]]: Same thing, but for the test set. 
# You're ensuring that both training and testing sets use the same features ("wt", "hp", "drat").

# model7 = LinearRegression(): You're creating a new instance of a Linear Regression model from sklearn.linear_model. 
# At this point, the model is empty — it's just created, not yet trained.

# model7.fit(X_train, y_train): Now you're training the model using the training data: 
# X_train holds the input features (wt, hp, drat) and y_train holds the target/output you're trying to predict. 
# The model learns the best-fitting line/plane in multidimensional space to predict y.

# model7.score(X_test, y_test): You're now evaluating how well the trained model performs on unseen test data. 
# score() by default returns the R² score (coefficient of determination). R² Score: 
# Measures how well the model predicts the target. 
# The Range: 1.0 = perfect prediction / 0.0 = model does no better than the mean / Negative = model is worse than just guessing the mean

#### POLYNOMIAL REGRESSION EXAMPLE
***