In [None]:
                                          #                 Theoretical

In [None]:
# What does R-squared represent in a regression model?

R-squared, or the coefficient of determination, represents the proportion of the variance in the dependent variable that is predictable from the independent variables. It typically ranges from 0 to 1, where 1 indicates that the model perfectly explains the variance and 0 indicates that the model explains none of it; it can be negative when the model fits worse than simply predicting the mean.

from sklearn.metrics import r2_score

# Toy observations and the predictions to be scored against them.
y_true, y_pred = [3, -0.5, 2, 7], [2.5, 0.0, 2, 8]

# r2_score takes the ground truth first and the predictions second.
r_squared = r2_score(y_true=y_true, y_pred=y_pred)
print("R-squared:", r_squared)


In [None]:
# What are the assumptions of linear regression?

Linear regression makes the following assumptions:

Linearity: The relationship between the independent and dependent variable should be linear.
Independence: Observations are independent of each other.
Homoscedasticity: The residuals have constant variance.
Normality of residuals: The residuals of the model should be normally distributed.
No multicollinearity: Independent variables should not be highly correlated.

Code to check assumptions:

Check linearity using scatter plots.
Check homoscedasticity with a residual vs. fitted plot.
Check normality of residuals using a Q-Q plot.

import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Stand-in residuals: 100 draws from a standard normal. They only illustrate
# the plot and do not come from a fitted model.
residuals = np.random.normal(loc=0, scale=1, size=100)

# Q-Q plot of the sample with a 45-degree reference line.
sm.qqplot(residuals, line="45")
plt.show()


In [None]:
# What is the difference between R-squared and Adjusted R-squared?

R-squared only considers the proportion of variance explained by the model.

Adjusted R-squared adjusts for the number of predictors, penalizing the model for adding non-useful predictors.

import statsmodels.api as sm

# Four observations of two predictors; add_constant puts the intercept column first.
# NOTE: the second predictor is the first plus one, so once the constant is
# added the design matrix is rank-deficient.
X = sm.add_constant([[1, 2], [2, 3], [3, 4], [4, 5]])
y = [2, 3, 5, 7]

# Ordinary least squares fit of y on the augmented design matrix.
model = sm.OLS(y, X).fit()

for label, value in (
    ("R-squared:", model.rsquared),
    ("Adjusted R-squared:", model.rsquared_adj),
):
    print(label, value)


In [None]:
# Why do we use Mean Squared Error (MSE)?

MSE is used to measure the average of the squares of the errors. The squaring of errors gives more weight to larger errors, helping to penalize models that make significant mistakes.

from sklearn.metrics import mean_squared_error

# Same toy targets/predictions as the R-squared example.
y_true, y_pred = [3, -0.5, 2, 7], [2.5, 0.0, 2, 8]

# Average of the squared differences between targets and predictions.
mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
print("Mean Squared Error:", mse)



In [None]:
# What does an Adjusted R-squared value of 0.85 indicate?

An Adjusted R-squared value of 0.85 means that 85% of the variance in the dependent variable is explained by the model, accounting for the number of predictors. This shows a strong fit.

import numpy as np
import pandas as pd  # not used in this cell; kept because other cells may rely on it
from sklearn.linear_model import LinearRegression  # not used here; kept for later cells
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# Sample dataset: 100 samples, 5 features; only the first two drive the target.
np.random.seed(42)
X = np.random.rand(100, 5)
y = 3 * X[:, 0] + 2 * X[:, 1] + np.random.randn(100)  # Target variable with some noise

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# statsmodels does not add an intercept on its own, so add the constant column.
X_train_sm = sm.add_constant(X_train)

# Fit the OLS (Ordinary Least Squares) model once.
# (The original cell repeated the add_constant / fit / print steps a second
# time verbatim, which only duplicated the output.)
ols_model = sm.OLS(y_train, X_train_sm).fit()

# Print R-squared and Adjusted R-squared
print("R-squared:", ols_model.rsquared)
print("Adjusted R-squared:", ols_model.rsquared_adj)


In [None]:
# How do we check for normality of residuals in linear regression?

We check normality by plotting the residuals and using statistical tests.
# Q-Q plot for normality
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Fit a small OLS model inside this cell so the plot shows genuine regression
# residuals and does not depend on `sm`, `plt` or `residuals` left in the
# kernel by some other cell.
rng = np.random.default_rng(42)
X_demo = sm.add_constant(rng.random((100, 2)))
y_demo = X_demo @ np.array([1.0, 3.0, 2.0]) + rng.normal(size=100)
residuals = sm.OLS(y_demo, X_demo).fit().resid

# fit=True standardizes the sample by its fitted location and scale, so the
# 45-degree line is a valid reference whatever the residual variance is.
sm.qqplot(residuals, line='45', fit=True)
plt.title("Q-Q Plot of OLS Residuals")
plt.show()


In [None]:
# What is multicollinearity, and how does it impact regression?

Multicollinearity occurs when two or more independent variables are highly correlated. It can make the regression coefficients unstable and difficult to interpret.

import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Predictor variables. The second column is the first plus one, i.e. the two
# predictors are perfectly collinear once an intercept is present.
X = pd.DataFrame([[1, 2], [2, 3], [3, 4], [4, 5]])

# variance_inflation_factor regresses one column on the others exactly as
# given; it does not add an intercept. Supply the constant explicitly so the
# auxiliary regressions match an ordinary regression with an intercept.
X_const = add_constant(X)

vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [
    variance_inflation_factor(X_const.values, i)
    for i in range(1, X_const.shape[1])  # column 0 is the constant; skip it
]
# Expect infinite (or extremely large) values here, possibly with a
# divide-by-zero warning: that is what perfect collinearity looks like.
print(vif_data)


In [None]:
# What is Mean Absolute Error (MAE)?

MAE is the average of the absolute differences between actual and predicted values. Unlike MSE, it does not penalize larger errors more than smaller ones.

from sklearn.metrics import mean_absolute_error

# Reuses the y_true / y_pred lists from the MSE example earlier in the notebook.
mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
print("Mean Absolute Error:", mae)


In [None]:
# What are the benefits of using an ML pipeline?

ML pipelines automate and streamline workflows, improving consistency and reproducibility. Pipelines allow the combination of data preprocessing steps with model training and evaluation.

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Build the demo data inside the cell. Previously this cell picked up the
# 4-row `X` from the VIF example and the 100-element `y` from the adjusted
# R-squared example, so fit() failed with inconsistent sample counts.
np.random.seed(42)
X = np.random.rand(100, 5)
y = 3 * X[:, 0] + 2 * X[:, 1] + np.random.randn(100)

# Scaling and the regressor are chained so both are fit together and the
# same scaling is applied again at prediction time.
pipeline = Pipeline([('scaler', StandardScaler()), ('model', LinearRegression())])
pipeline.fit(X, y)


In [None]:
# Why is RMSE considered more interpretable than MSE?

RMSE is more interpretable because it is in the same units as the dependent variable, making it easier to understand in the context of the original data.

import numpy as np
from sklearn.metrics import mean_squared_error

# RMSE is the square root of the MSE of the predictions.
# y_true / y_pred come from the MSE example earlier in the notebook.
mse_value = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse_value)
print("Root Mean Squared Error:", rmse)


In [None]:
# What is pickling in Python, and how is it useful in ML?

Pickling is the process of serializing a Python object. It is useful in ML for saving trained models to be reused later without retraining.
import pickle

# Serialize the `pipeline` object from the pipeline example to disk.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

# Read the bytes back and rebuild the object.
# Unpickling can execute arbitrary code, so only load files you trust.
with open('model.pkl', 'rb') as model_file:
    loaded_model = pickle.loads(model_file.read())


In [None]:
# What does a high R-squared value mean?

A high R-squared value indicates that a large proportion of the variance in the dependent variable (target) is explained by the independent variables (features). A value closer to 1 means the model fits the data well, but it doesn't necessarily mean the model is perfect. It may also be a sign of overfitting.

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Perfectly linear toy data: the target equals the single feature.
y = [1, 2, 3, 4]
X = [[value] for value in y]

# Fit, then score the predictions on the very same points.
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
r_squared = r2_score(y, y_pred)

print(f"R-squared: {r_squared}")



In [None]:
# What happens if linear regression assumptions are violated?

Violation of Linearity: The model will fail to capture the true relationship between variables, leading to inaccurate predictions.

Violation of Homoscedasticity: Results in inefficient estimates and biased inferences.

Violation of Normality of Residuals: Can lead to incorrect confidence intervals and p-values.

Violation of No Multicollinearity: Can make it difficult to identify the individual effect of predictors.

Violation of Independence: Can lead to over- or underestimation of the significance of variables.

import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt

# Fit a small regression so the diagnostics use real fitted values and
# residuals. Previously random noise was plotted against the observation
# index while the axis was labelled "Fitted values".
np.random.seed(42)
x_demo = np.random.rand(100) * 10
y_demo = 2 + 3 * x_demo + np.random.normal(0, 1, 100)
ols_fit = sm.OLS(y_demo, sm.add_constant(x_demo)).fit()
fitted_values = ols_fit.fittedvalues
residuals = ols_fit.resid

# Residual Plot to check Homoscedasticity: the vertical spread should look
# roughly constant across the range of fitted values.
plt.scatter(fitted_values, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
plt.title("Residual Plot to Check Homoscedasticity")
plt.show()

# Q-Q plot to check Normality. fit=True standardizes the residuals so the
# 45-degree reference line applies regardless of their scale.
sm.qqplot(residuals, line='45', fit=True)
plt.show()


In [None]:
# How can we address multicollinearity in regression?

Multicollinearity occurs when independent variables are highly correlated. It can be addressed by:

Removing correlated variables: Dropping one of the correlated variables.
Using Principal Component Analysis (PCA): To reduce dimensionality.
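Ridge or Lasso Regression: Regularization methods that penalize large coefficients, which stabilizes the estimates when predictors are correlated (Lasso can also drop redundant predictors by shrinking their coefficients to zero).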

import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Sample Data
X = pd.DataFrame({
    'feature_1': [1, 2, 3, 4, 5],
    'feature_2': [2, 4, 6, 8, 10],  # Exactly 2 * feature_1
    'feature_3': [1, 3, 5, 7, 9]    # Exactly 2 * feature_1 - 1
})

# variance_inflation_factor does not add an intercept to its auxiliary
# regressions, so include the constant column explicitly.
X_const = add_constant(X)

# Calculating VIF for the predictors only (column 0 is the constant).
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [
    variance_inflation_factor(X_const.values, i)
    for i in range(1, X_const.shape[1])
]
# All three features are exact linear functions of one another, so expect
# infinite (or extremely large) VIFs, possibly with a divide-by-zero warning.
print(vif_data)


In [None]:
# How can feature selection improve model performance in regression analysis?

Feature selection removes irrelevant or redundant features, which can:

Reduce overfitting.

Improve model interpretability.

Speed up computation.

Avoid multicollinearity.

Methods of feature selection include Recursive Feature Elimination (RFE) and Regularization techniques like Lasso Regression.

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Four samples with three features each, plus their targets.
X = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9], [1.0, 1.1, 1.2]]
y = [1, 2, 3, 4]

# Recursive feature elimination around a linear model, keeping two features.
model = LinearRegression()
selector = RFE(estimator=model, n_features_to_select=2).fit(X, y)

# support_ is the boolean keep-mask; ranking_ gives 1 to every kept feature.
print("Selected Features:", selector.support_)
print("Feature Ranking:", selector.ranking_)




In [None]:
# How is Adjusted R-squared calculated?

Adjusted R-squared is a modification of R-squared that adjusts for the number of predictors in the model. It is calculated as:

$$
\text{Adjusted } R^2 = 1 - \frac{(1 - R^2)(n - 1)}{n - p - 1}
$$
Where:

$n$ is the number of data points.
$p$ is the number of predictors.

import numpy as np
import statsmodels.api as sm

# Self-contained, noisy data. The previous version reused (and overwrote) `X`
# from the feature-selection cell, whose target is an exact linear function of
# the features, so both scores were 1.0 and showed nothing about the adjustment.
np.random.seed(42)
X_raw = np.random.rand(50, 3)
y = 1 + 2 * X_raw[:, 0] - X_raw[:, 1] + np.random.randn(50)

X = sm.add_constant(X_raw)  # Adding constant for intercept
model = sm.OLS(y, X).fit()

# Apply the formula by hand: n observations, p predictors (constant excluded).
n, p = X_raw.shape
manual_adj_r_squared = 1 - (1 - model.rsquared) * (n - 1) / (n - p - 1)

print(f"R-squared: {model.rsquared}")
print(f"Adjusted R-squared: {model.rsquared_adj}")
print(f"Adjusted R-squared (from formula): {manual_adj_r_squared}")


In [None]:
# Why is MSE sensitive to outliers?

Mean Squared Error (MSE) squares the error values, which means larger errors (such as those caused by outliers) have a disproportionately high impact. This can skew the error metric, making it sensitive to outliers.

from sklearn.metrics import mean_squared_error

# The final prediction (50 against a target of 10) is the outlier.
y_true, y_pred = [3, -0.5, 2, 7, 10], [2.5, 0.0, 2, 8, 50]

# The single large miss dominates the average of squared errors.
mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
print("MSE with Outliers:", mse)


In [None]:
# What is the role of homoscedasticity in linear regression?

Homoscedasticity means that the variance of the residuals is constant across all levels of the independent variables. Violating this assumption (heteroscedasticity) can result in inefficient estimates, affecting the reliability of the model’s coefficients and predictions.

import numpy as np
import matplotlib.pyplot as plt

# y_true and y_pred are plain Python lists in the preceding example, and
# list - list raises TypeError; convert to arrays for element-wise subtraction.
residuals = np.asarray(y_true) - np.asarray(y_pred)

# Residuals against predictions; the dashed line marks zero error.
plt.scatter(y_pred, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Residuals vs Predicted")
plt.show()


In [None]:
# What is Root Mean Squared Error (RMSE)?

RMSE is the square root of the Mean Squared Error (MSE), which makes it easier to interpret because it is in the same units as the dependent variable. It provides a good measure of how well the model’s predictions match the actual data.

from sklearn.metrics import mean_squared_error
import numpy as np

# Take the MSE of the current y_true / y_pred and bring it back to the
# target's units with a square root.
squared_error_mean = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(squared_error_mean)
print("RMSE:", rmse)


In [None]:
# Why is pickling considered risky?

Pickling can be risky because it allows for arbitrary code execution when loading the serialized object. If the pickle file is maliciously altered, it can lead to a security vulnerability by executing harmful code during deserialization.

In [None]:
# What alternatives exist to pickling for saving ML models?

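Alternatives to plain pickling include:

Joblib: Efficient for large models or arrays. Note that joblib is built on pickle, so loading an untrusted joblib file carries the same arbitrary-code-execution risk.
HDF5 (via TensorFlow/Keras): Commonly used for deep learning models.
ONNX (Open Neural Network Exchange): Cross-platform, framework-agnostic format for saving models; it stores the model graph and weights rather than arbitrary Python objects.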


import joblib

# Write whatever `model` currently refers to, then read it straight back.
model_path = 'model.joblib'
joblib.dump(model, model_path)
loaded_model = joblib.load(model_path)


In [None]:
# What is heteroscedasticity, and why is it a problem?

Heteroscedasticity occurs when the variance of residuals is not constant. This violates one of the key assumptions of linear regression, leading to inefficient estimates and biased standard errors, which can affect the validity of hypothesis tests.

import numpy as np
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan

# Self-contained example. The previous version passed `residuals` and `X`
# left over from unrelated cells, which do not have matching lengths.
np.random.seed(42)
x_demo = np.random.rand(200) * 10
# The noise scale grows with x, so the data are heteroscedastic by construction.
y_demo = 2 + 3 * x_demo + np.random.randn(200) * x_demo

# The test regresses squared residuals on these columns; they must include a constant.
exog = sm.add_constant(x_demo)
residuals = sm.OLS(y_demo, exog).fit().resid

# Performing Breusch-Pagan test; index 1 is the p-value of the LM statistic.
bp_test = het_breuschpagan(residuals, exog)
print(f"Breusch-Pagan Test p-value: {bp_test[1]}")


In [None]:
# How can interaction terms enhance a regression model's predictive power?

Interaction terms allow the model to capture the combined effect of two or more variables. In many cases, the impact of one feature on the dependent variable depends on the value of another feature. Adding interaction terms can improve the model's predictive power by incorporating these joint effects.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Self-contained data with a genuine x0 * x1 effect. Previously `X` was the
# design matrix from the adjusted R-squared cell, which already contains a
# constant column, so the "interactions" included redundant const * feature terms.
np.random.seed(42)
X = np.random.rand(100, 2)
y = 1 + 2 * X[:, 0] + 3 * X[:, 1] + 4 * X[:, 0] * X[:, 1] + 0.1 * np.random.randn(100)

# Adding interaction terms: yields x0, x1 and x0 * x1 (no squares, no bias column).
poly = PolynomialFeatures(interaction_only=True, include_bias=False)
X_interaction = poly.fit_transform(X)

# Fit the model with interaction terms
model_interaction = LinearRegression().fit(X_interaction, y)

# Compare against the additive-only model to show what the interaction adds.
model_additive = LinearRegression().fit(X, y)
print("R-squared without interaction:", model_additive.score(X, y))
print("R-squared with interaction:", model_interaction.score(X_interaction, y))


In [None]:
                                                      #          Practical

In [None]:
#  Write a Python script to visualize the distribution of errors (residuals) for a multiple linear regression model using Seaborn's "diamonds" dataset.

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Seaborn's bundled "diamonds" table.
diamonds = sns.load_dataset('diamonds')

# Three numeric predictors; price is the target.
feature_columns = ['carat', 'depth', 'table']
X = diamonds[feature_columns]
y = diamonds['price']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training part only.
model = LinearRegression().fit(X_train, y_train)

# Residual = held-out price minus predicted price.
y_pred = model.predict(X_test)
residuals = y_test - y_pred

# Histogram of the residuals with a KDE overlay.
sns.histplot(residuals, kde=True)
plt.title('Distribution of Residuals')
plt.xlabel('Residuals')
plt.show()


In [None]:
#  Write a Python script to calculate and print Mean Squared Error (MSE), Mean Absolute Error (MAE), and Root Mean Squared Error (RMSE) for a linear regression model.

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Error metrics for the y_test / y_pred pair produced by the previous cell.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE computed above

# Report in the order MSE, MAE, RMSE.
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")


In [None]:
# Write a Python script to check if the assumptions of linear regression are met. Use a scatter plot to check linearity, residuals plot for homoscedasticity, and correlation matrix for multicollinearity.

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Linearity (scatter plot): pairwise scatter plots of predictors and target.
sns.pairplot(diamonds[['carat', 'depth', 'table', 'price']])
plt.show()

# Homoscedasticity (residuals plot).
# `residuals` is already y_test - y_pred. sns.residplot would regress it on the
# fitted values again and plot the residuals of *that* fit, so plot the
# residuals directly and let regplot add only the LOWESS trend line.
sns.regplot(
    x=y_pred,
    y=residuals,
    lowess=True,
    scatter_kws={"alpha": 0.3, "s": 10},
    line_kws={"color": "red"},
)
plt.axhline(y=0, color='black', linestyle='--')
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.title("Residuals vs Fitted")
plt.show()

# Multicollinearity (correlation matrix and VIF)
corr_matrix = X_train.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

# VIF. variance_inflation_factor does not add an intercept, and without one
# predictors that sit far from zero get inflated values. Add the constant
# column and report the predictors only (column 0 is the constant).
X_train_const = add_constant(X_train)
vif_data = pd.DataFrame()
vif_data["Feature"] = X_train.columns
vif_data["VIF"] = [
    variance_inflation_factor(X_train_const.values, i)
    for i in range(1, X_train_const.shape[1])
]
print(vif_data)


In [None]:
# Write a Python script that creates a machine learning pipeline with feature scaling and evaluates the performance of different regression models.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score

# The task asks for several regression models; previously only Ridge was
# evaluated. Ridge is kept last so `pipeline` and `scores` end up holding the
# same objects as before.
candidate_models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
}

for model_name, regressor in candidate_models.items():
    # Create pipeline: the scaler is refit inside every CV fold, so no
    # information from the validation fold leaks into the scaling.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('regression', regressor)
    ])

    # Evaluate model performance using cross-validation
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
    print(f"{model_name} - Cross-validated R-squared scores:", scores)
    print(f"{model_name} - Mean R-squared:", scores.mean())


In [None]:
#  Implement a simple linear regression model on a dataset and print the model's coefficients, intercept, and R-squared score.

# Single-predictor model: price explained by carat alone, fit on the training split.
simple_model = LinearRegression().fit(X_train[['carat']], y_train)

# Slope, intercept, and R-squared measured on the held-out split.
print(f"Coefficient: {simple_model.coef_[0]}")
print(f"Intercept: {simple_model.intercept_}")
print(f"R-squared: {simple_model.score(X_test[['carat']], y_test)}")


In [None]:
#  Write a Python script that analyzes the relationship between total bill and tip in the 'tips' dataset using simple linear regression and visualizes the results.

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

tips = sns.load_dataset('tips')

# Define X and y
X = tips[['total_bill']]
y = tips['tip']

# Train the model
model = LinearRegression()
model.fit(X, y)

# Report the fitted relationship. Previously the model was trained but its
# results were never used or shown.
print(f"Slope (tip per unit of total bill): {model.coef_[0]}")
print(f"Intercept: {model.intercept_}")
print(f"R-squared: {model.score(X, y)}")

# Plot regression line (regplot fits and draws its own least-squares line).
sns.regplot(x='total_bill', y='tip', data=tips, line_kws={"color": "red"})
plt.title("Tip vs Total Bill")
plt.show()


In [None]:
# Write a Python script that fits a linear regression model to a synthetic dataset with one feature. Use the model to predict new values and plot the data points along with the regression line.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Generate synthetic data (seeded so the cell gives the same result on every run).
np.random.seed(42)
X = np.random.rand(100, 1) * 10
y = 2.5 * X + np.random.randn(100, 1)

# Train the model
model = LinearRegression()
model.fit(X, y)

# Predict values for the training inputs (used for the plotted line)
y_pred = model.predict(X)

# Predict new, unseen values as the task asks. Inputs need shape (n_samples, 1).
X_new = np.array([[2.0], [5.0], [12.0]])
y_new = model.predict(X_new)
for x_value, prediction in zip(X_new.ravel(), y_new.ravel()):
    print(f"x = {x_value}: predicted y = {prediction}")

# Plot data and regression line
plt.scatter(X, y, color='blue')
plt.plot(X, y_pred, color='red')
plt.show()


In [None]:
# Write a Python script that pickles a trained linear regression model and saves it to a file.

import pickle

# Fit a linear model on the current training split.
model = LinearRegression().fit(X_train, y_train)

model_path = 'linear_regression_model.pkl'

# Serialize the fitted model to disk.
with open(model_path, 'wb') as out_file:
    pickle.dump(model, out_file)

# Deserialize it again. Only unpickle files you trust: loading can run arbitrary code.
with open(model_path, 'rb') as in_file:
    loaded_model = pickle.load(in_file)


In [None]:
#  Write a Python script that fits a polynomial regression model (degree 2) to a dataset and plots the regression curve.

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# Polynomial transformation: columns 1, x, x^2
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

# Train polynomial regression model
model_poly = LinearRegression()
model_poly.fit(X_poly, y)

# plt.plot joins points in the order given. X was drawn at random, so plotting
# predictions in sample order makes the line zigzag across the figure.
# Evaluate the curve on the sorted inputs instead.
X_curve = np.sort(X, axis=0)
y_curve = model_poly.predict(poly.transform(X_curve))

# Plot polynomial regression curve
plt.scatter(X, y, color='blue')
plt.plot(X_curve, y_curve, color='red')
plt.show()


In [None]:
# Generate synthetic data for simple linear regression (use random values for X and y) and fit a linear regression model to the data. Print the model's coefficient and intercept

# Synthetic data: 100 inputs in [0, 10) with response 4 + 3x plus standard-normal noise.
true_intercept, true_slope = 4, 3
X = np.random.rand(100, 1) * 10
y = true_intercept + true_slope * X + np.random.randn(100, 1)

# Least-squares fit.
model = LinearRegression().fit(X, y)

# y is a column vector, so coef_ is 2-D and intercept_ is 1-D.
print(f"Coefficient: {model.coef_[0][0]}")
print(f"Intercept: {model.intercept_[0]}")


In [None]:
# Write a Python script that fits polynomial regression models of different degrees to a synthetic dataset and compares their performance

from sklearn.model_selection import train_test_split

degrees = [1, 2, 3]

# Training R-squared cannot go down as the degree grows, so on its own it is
# not a fair comparison. Hold out a test split and report both scores.
X_fit, X_eval, y_fit, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)

for degree in degrees:
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X_fit)
    model = LinearRegression()
    model.fit(X_poly, y_fit)
    train_score = model.score(X_poly, y_fit)
    # Reuse the fitted transformer on the held-out inputs.
    score = model.score(poly.transform(X_eval), y_eval)
    print(f"Degree {degree}: train R-squared = {train_score}, test R-squared = {score}")


In [None]:
# Write a Python script that fits a multiple linear regression model with two features and prints the model's coefficients, intercept, and R-squared score.

# Select two features and the matching target.
# `y` must be set here: by this point it still holds the 100 x 1 synthetic
# array from the preceding cells, which does not match the diamonds rows.
X = diamonds[['carat', 'depth']]
y = diamonds['price']

# Train model
model = LinearRegression()
model.fit(X, y)

# Print coefficients, intercept, and R-squared score (scored on the training data)
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")
print(f"R-squared: {model.score(X, y)}")


In [None]:
# Write a Python script that generates synthetic data, fits a linear regression model, and visualizes the regression line along with the data points.

# 100 feature values in [0, 10) and a noisy linear response around 3 + 2x.
X = np.random.rand(100, 1) * 10
noise = np.random.randn(100, 1)
y = 3 + 2 * X + noise

# Fit, then predict at the training inputs.
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)

# Observations in blue, fitted line in red.
plt.scatter(X, y, color='blue')
plt.plot(X, y_pred, color='red')
plt.show()


In [None]:
#  Write a Python script that uses the Variance Inflation Factor (VIF) to check for multicollinearity in a dataset with multiple features.

import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Reload the diamonds table and keep three numeric predictors.
diamonds = sns.load_dataset('diamonds')
X = diamonds[['carat', 'depth', 'table']]

# The VIF regressions need an intercept, so add a constant column first.
X_const = add_constant(X)
design = X_const.values

# One VIF per column of the augmented matrix (the constant included).
vif_data = pd.DataFrame({
    "Feature": X_const.columns,
    "VIF": [variance_inflation_factor(design, idx) for idx in range(design.shape[1])],
})

print(vif_data)


In [None]:
#  Write a Python script that generates synthetic data for a polynomial relationship (degree 4), fits a polynomial regression model, and plots the regression curve.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Evenly spaced inputs on [-3, 3] with a noisy quartic response.
X = np.linspace(-3, 3, 100).reshape(-1, 1)
y = 3 * X**4 - 2 * X**3 + X**2 + 5 + np.random.randn(100, 1)

# Expand x into powers up to 4, then fit a linear model on the expanded columns.
poly = PolynomialFeatures(degree=4)
X_poly = poly.fit_transform(X)
model = LinearRegression().fit(X_poly, y)

# X is already sorted, so the fitted values can be joined in order.
y_curve = model.predict(X_poly)
plt.scatter(X, y, color='blue')
plt.plot(X, y_curve, color='red')
plt.show()


In [None]:
#  Write a Python script that creates a machine learning pipeline with data standardization and a multiple linear regression model, and prints the R-squared score.

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Diamonds data: three numeric predictors, price as the target.
diamonds = sns.load_dataset('diamonds')
X, y = diamonds[['carat', 'depth', 'table']], diamonds['price']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features, then fit a multiple linear regression.
steps = [('scaler', StandardScaler()), ('regression', LinearRegression())]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)

# Score the held-out predictions.
y_pred = pipeline.predict(X_test)
print(f"R-squared: {r2_score(y_test, y_pred)}")


In [None]:
# Write a Python script that performs polynomial regression (degree 3) on a synthetic dataset and plots the regression curve.

# Evenly spaced inputs on [-3, 3] with a noisy cubic response.
X = np.linspace(-3, 3, 100).reshape(-1, 1)
y = 2 * X**3 - X**2 + 4 + np.random.randn(100, 1)

# Cubic feature expansion followed by an ordinary linear fit.
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
model = LinearRegression().fit(X_poly, y)

# Data in blue, fitted curve in red (X is sorted, so the line is drawn in order).
fitted_curve = model.predict(X_poly)
plt.scatter(X, y, color='blue')
plt.plot(X, fitted_curve, color='red')
plt.show()


In [None]:
# Write a Python script that performs multiple linear regression on a synthetic dataset with 5 features. Print the R-squared score and model coefficients.

# 100 samples of 5 uniform features; the target is a fixed linear
# combination of them plus standard-normal noise.
X = np.random.rand(100, 5)
y = 2 + 3 * X[:, 0] - 4 * X[:, 1] + 5 * X[:, 2] - X[:, 3] + 2 * X[:, 4] + np.random.randn(100)

# Multiple linear regression on all five features.
model = LinearRegression().fit(X, y)

# R-squared on the fitting data and the estimated slopes.
print(f"R-squared: {model.score(X, y)}")
print(f"Coefficients: {model.coef_}")


In [None]:
#  Write a Python script that generates synthetic data for linear regression, fits a model, and visualizes the data points along with the regression line.

# Noisy samples around the line y = 3 + 2x, with x drawn from [0, 10).
n_samples = 100
X = np.random.rand(n_samples, 1) * 10
y = 3 + 2 * X + np.random.randn(n_samples, 1)

# Fit the regression and evaluate it at the observed inputs.
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

# Scatter of the data with the fitted line on top.
plt.scatter(X, y, color='blue')
plt.plot(X, y_pred, color='red')
plt.show()


In [None]:
#  Create a synthetic dataset with 3 features and perform multiple linear regression. Print the model's R-squared score and coefficients.

# 100 samples of 3 uniform features with a known linear signal plus noise.
X = np.random.rand(100, 3)
y = 5 + 2 * X[:, 0] - 3 * X[:, 1] + X[:, 2] + np.random.randn(100)

# Fit the multiple regression.
model = LinearRegression().fit(X, y)

# In-sample R-squared and the estimated coefficients.
print(f"R-squared: {model.score(X, y)}")
print(f"Coefficients: {model.coef_}")


In [None]:
#  Write a Python script that demonstrates how to serialize and deserialize machine learning models using joblib instead of pickling.

import joblib

# Fit a linear model on the current training split.
model = LinearRegression().fit(X_train, y_train)

# Write the fitted model with joblib and read it straight back.
joblib_path = 'linear_regression_model.joblib'
joblib.dump(model, joblib_path)
loaded_model = joblib.load(joblib_path)


In [None]:
#  Write a Python script to perform linear regression with categorical features using one-hot encoding. Use the Seaborn 'tips' dataset.

import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the 'tips' dataset
tips = sns.load_dataset('tips')

# Define features and target
X = tips[['sex', 'day', 'time']]
y = tips['total_bill']

# Create a pipeline with one-hot encoding for categorical variables.
# drop='first' removes one level per variable. With every level encoded, the
# dummy columns of each variable sum to one and are perfectly collinear with
# the model's intercept (the dummy-variable trap), which leaves the
# coefficients of an unregularized linear model non-unique.
pipeline = Pipeline([
    ('encoder', ColumnTransformer([
        ('onehot', OneHotEncoder(drop='first'), ['sex', 'day', 'time'])
    ])),
    ('regression', LinearRegression())
])

# Train the model
pipeline.fit(X, y)

# Predict and print R-squared (scored on the training data)
y_pred = pipeline.predict(X)
print(f"R-squared: {pipeline.score(X, y)}")


In [None]:
# Compare Ridge Regression with Linear Regression on a synthetic dataset and print the coefficients and R-squared score.

from sklearn.linear_model import Ridge

# Shared synthetic data: three uniform features, linear signal, unit noise.
X = np.random.rand(100, 3)
y = 2 + 3 * X[:, 0] - 2 * X[:, 1] + X[:, 2] + np.random.randn(100)

# Fit the unpenalized model first, then the ridge model, on the same data.
linear_model = LinearRegression().fit(X, y)
ridge_model = Ridge(alpha=1.0).fit(X, y)

# Coefficients and in-sample R-squared for each model.
print("Linear Regression Coefficients:", linear_model.coef_)
print("Linear Regression R-squared:", linear_model.score(X, y))

print("Ridge Regression Coefficients:", ridge_model.coef_)
print("Ridge Regression R-squared:", ridge_model.score(X, y))


In [None]:
# Write a Python script that uses cross-validation to evaluate a Linear Regression model on a synthetic dataset.

from sklearn.model_selection import cross_val_score

# Synthetic regression problem with three features.
X = np.random.rand(100, 3)
y = 2 + 3 * X[:, 0] - 2 * X[:, 1] + X[:, 2] + np.random.randn(100)

# An unfitted estimator; cross_val_score handles the per-fold fitting.
model = LinearRegression()

# 5-fold cross-validation scored with R-squared.
scores = cross_val_score(estimator=model, X=X, y=y, scoring='r2', cv=5)
print("Cross-validated R-squared scores:", scores)
print("Mean R-squared:", scores.mean())


In [None]:
# Write a Python script that compares polynomial regression models of different degrees and prints the R-squared score for each.

from sklearn.model_selection import train_test_split

# Generate synthetic data: a noisy cubic on [-3, 3]
X = np.linspace(-3, 3, 100).reshape(-1, 1)
y = 2 * X**3 - X**2 + 4 + np.random.randn(100, 1)

# Training R-squared can only stay equal or rise as the degree increases, so
# it cannot tell a good fit from an overfit. Hold out a test split and report
# both scores.
X_fit, X_eval, y_fit, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)

# Compare polynomial regression models of degrees 1, 2, 3, 4
for degree in range(1, 5):
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X_fit)

    # Train model
    model = LinearRegression()
    model.fit(X_poly, y_fit)

    # Score on the fitting data and on the held-out data (same transformer).
    train_r2 = model.score(X_poly, y_fit)
    test_r2 = model.score(poly.transform(X_eval), y_eval)
    print(f"Degree {degree}: train R-squared = {train_r2}, test R-squared = {test_r2}")
