#Q1

In [36]:
# Importing libraries and loading dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
data = pd.read_csv('50_Startups (1).csv')

# Display the first few rows of the dataset
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [37]:
# Preprocessing data

# Inspect the dataset
data.head()

# Check for missing values
data.isnull().sum()

# Convert 'State'variable to dummy variables
data = pd.get_dummies(data, columns=['State'], drop_first=True)

# Define features and target variable
X = data[['R&D Spend', 'Administration', 'Marketing Spend', 'State_Florida', 'State_New York']]
y = data['Profit']

In [27]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# Single feature regression (e.g., 'R&D Spend')
X_train_single = X_train[['R&D Spend']]
X_test_single = X_test[['R&D Spend']]

# Create and fit the model
model_single = LinearRegression()
model_single.fit(X_train_single, y_train)

# Predict on the test set
y_pred_single = model_single.predict(X_test_single)

# Evaluate the model
mse_single = mean_squared_error(y_test, y_pred_single)
r2_single = r2_score(y_test, y_pred_single)

print(f"Single Regression - Mean Squared Error (MSE): {mse_single:.2f}")
print(f"Single Regression - R-squared (R^2): {r2_single:.2f}")

accuracy = 1-mse_single / y_test.var()
print('Accuracy: ', accuracy)

Mean Squared Error (MSE): 59510962.81
R-squared (R^2): 0.93


In [31]:
# Create and fit the multiple regression model
model_multiple = LinearRegression()
model_multiple.fit(X_train, y_train)

# Predict on the test set
y_pred_multiple = model_multiple.predict(X_test)

# Evaluate the model
mse_multiple = mean_squared_error(y_test, y_pred_multiple)
r2_multiple = r2_score(y_test, y_pred_multiple)

print(f"Multiple Regression - Mean Squared Error (MSE): {mse_multiple:.2f}")
print(f"Multiple Regression - R-squared (R^2): {r2_multiple:.2f}")

accuracy = 1-mse_multiple / y_test.var()
print('Accuracy: ', accuracy)

Multiple Regression - Mean Squared Error (MSE): 82010363.04
Multiple Regression - R-squared (R^2): 0.90
Accuracy:  0.9088539772895774


# Q2

In [38]:
# Importing libraries and loading dataset
import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv('ToyotaCorolla.csv')

# Display the first few rows of the dataset
data.head()

# Select certain columns
selected_columns = ["Price", "Age_08_04", "KM", "HP", "cc", "Doors", "Gears", "Quarterly_Tax", "Weight"]
data = data[selected_columns]

# Display the updated dataframe
data.head()

Unnamed: 0,Price,Age_08_04,KM,HP,cc,Doors,Gears,Quarterly_Tax,Weight
0,13500,23,46986,90,2000,3,5,210,1165
1,13750,23,72937,90,2000,3,5,210,1165
2,13950,24,41711,90,2000,3,5,210,1165
3,14950,26,48000,90,2000,3,5,210,1165
4,13750,30,38500,90,2000,3,5,210,1170


In [40]:
# Preprocessing Data
# Check for missing values
data.isnull().sum()

# drop rows with any missing values.
data = data.dropna()

# Display the cleaned DataFrame
data.head()

Unnamed: 0,Price,Age_08_04,KM,HP,cc,Doors,Gears,Quarterly_Tax,Weight
0,13500,23,46986,90,2000,3,5,210,1165
1,13750,23,72937,90,2000,3,5,210,1165
2,13950,24,41711,90,2000,3,5,210,1165
3,14950,26,48000,90,2000,3,5,210,1165
4,13750,30,38500,90,2000,3,5,210,1170


In [41]:
# Split the Data into Features and Target Variable
# Define features (X) and target variable (y)
X = data.drop(columns=['Price'])
y = data['Price']

In [42]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train:  (1148, 8)
Shape of X_test:  (288, 8)
Shape of y_train:  (1148,)
Shape of y_test: (288,)


In [20]:
# Train the Linear Regression Model
from sklearn.linear_model import LinearRegression

# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)

In [22]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.2f}")

# Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.2f}")

# Root Mean Squared Error
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

# R-squared
r2 = r2_score(y_test, y_pred)
print(f"R-squared : {r2:.2f}")

# Accuracy
mse = mean_squared_error(y_test, y_pred)
accuracy = 1 - mse / y_test.var()
print('Accuracy: ', accuracy)

Mean Absolute Error (MAE): 995.59
Mean Squared Error (MSE): 1950244.99
Root Mean Squared Error (RMSE): 1396.51
R-squared : 0.85
Accuracy:  0.8543427969541446
