Q1. Write a Python script to visualize the distribution of errors (residuals) for a multiple linear regression model
using Seaborn's "diamonds" dataset.

In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


# Load the diamonds data, keeping only numeric columns and complete rows.
diamonds = sns.load_dataset('diamonds').select_dtypes(include=np.number).dropna()

# 'price' is the target; every remaining numeric column is a predictor.
y = diamonds['price']
X = diamonds.drop(columns='price')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split, then compute residuals on the held-out split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
residuals = y_test - y_pred

# Histogram with a KDE overlay; the dashed line marks zero error.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, kde=True, color='blue', bins=30, ax=ax)
ax.axvline(0, color='red', linestyle='--', label='Zero Error')
ax.set_title('Distribution of Residuals')
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()


Q2. Write a Python script to calculate and print Mean Squared Error (MSE), Mean Absolute Error (MAE), and Root
Mean Squared Error (RMSE) for a linear regression model.

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression

# One-feature synthetic regression problem with a fixed 80/20 split.
X, y = make_regression(n_samples=100, n_features=1, noise=15, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train, then predict the held-out samples.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# RMSE is derived from MSE so both metrics come from the same computation.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")


Q3. Write a Python script to check if the assumptions of linear regression are met. Use a scatter plot to check
linearity, residuals plot for homoscedasticity, and correlation matrix for multicollinearity.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Synthetic data: three named features and a noisy target.
features, target = make_regression(n_samples=100, n_features=3, noise=15, random_state=42)
X = pd.DataFrame(features, columns=['Feature1', 'Feature2', 'Feature3'])
y = pd.Series(target, name='Target')

# 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the model and compute test-set residuals.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
residuals = y_test - y_pred

# Linearity: actual vs. predicted, with the identity line as reference.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_test, y_pred, alpha=0.7, color='blue')
limits = [y_test.min(), y_test.max()]
ax.plot(limits, limits, 'r--', label='Perfect Fit')
ax.set_title('Linearity Check: Actual vs Predicted')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.legend()
plt.show()

# Homoscedasticity: residuals plotted against predictions around zero.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_pred, residuals, alpha=0.7, color='green')
ax.axhline(0, color='red', linestyle='--', label='Zero Residual')
ax.set_title('Homoscedasticity Check: Residuals vs Predicted')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.legend()
plt.show()

# Multicollinearity: pairwise feature correlations over the full feature table.
correlation_matrix = X.corr()
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix (Multicollinearity Check)')
plt.show()


Q4. Write a Python script that creates a machine learning pipeline with feature scaling and evaluates the
performance of different regression models.

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

# Synthetic regression problem with a fixed 80/20 train/test split.
X, y = make_regression(n_samples=200, n_features=5, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate regressors, keyed by the label used in the printed report.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
}

for name, model in models.items():
    # The scaler is a pipeline step, so it is fitted on the training split only.
    pipeline = Pipeline(steps=[("scaler", StandardScaler()), ("regressor", model)])
    score = pipeline.fit(X_train, y_train).score(X_test, y_test)
    print(f"{name} R-squared: {score:.2f}")


Q5. Implement a simple linear regression model on a dataset and print the model's coefficients, intercept, and
R-squared score.

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression

# One-feature synthetic data, split 80/20 with a fixed seed.
X, y = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split; R-squared is measured on the held-out split.
model = LinearRegression().fit(X_train, y_train)
r_squared = model.score(X_test, y_test)

# Single feature, so coef_ holds exactly one slope.
slope = model.coef_[0]
print(f"Coefficient: {slope:.2f}")
print(f"Intercept: {model.intercept_:.2f}")
print(f"R-squared score: {r_squared:.2f}")


Q6. Write a Python script that analyzes the relationship between total bill and tip in the 'tips' dataset using
simple linear regression and visualizes the results.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Predictor is kept as a one-column DataFrame; the target is a Series.
tips = sns.load_dataset('tips')
X, y = tips[['total_bill']], tips['tip']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split and predict tips for the test bills.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# All observations as points; the fitted line is drawn through the test predictions.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X, y, alpha=0.7, label="Data Points")
ax.plot(X_test, y_pred, color='red', label="Regression Line")
ax.set_title("Total Bill vs Tip")
ax.set_xlabel("Total Bill")
ax.set_ylabel("Tip")
ax.legend()
plt.show()


Q7. Write a Python script that fits a linear regression model to a synthetic dataset with one feature. Use the
model to predict new values and plot the data points along with the regression line.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression

# One-feature synthetic data; the model is fitted on every sample.
X, y = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)
model = LinearRegression().fit(X, y)

# Predict on 100 evenly spaced new inputs across the observed feature range.
X_new = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
y_pred = model.predict(X_new)

fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X, y, alpha=0.7, label="Data Points")
ax.plot(X_new, y_pred, color='red', label="Regression Line")
ax.set_title("Linear Regression on Synthetic Data")
ax.set_xlabel("Feature")
ax.set_ylabel("Target")
ax.legend()
plt.show()


Q8. Write a Python script that pickles a trained linear regression model and saves it to a file.

In [None]:
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression

# Train a one-feature linear model on synthetic data.
X, y = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)
model = LinearRegression().fit(X, y)

# Serialise the fitted estimator; binary mode is required for pickle output.
with open("linear_regression_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)


Q9. Write a Python script that fits a polynomial regression model (degree 2) to a dataset and plots the
regression curve.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)

# Expand the single feature to [1, x, x^2] and fit an ordinary linear model on it.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
model = LinearRegression().fit(X_poly, y)

# Evaluate the fitted curve on an evenly spaced grid over the observed range.
X_new = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
X_new_poly = poly.transform(X_new)
y_pred = model.predict(X_new_poly)

fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X, y, alpha=0.7, label="Data Points")
ax.plot(X_new, y_pred, color='red', label="Polynomial Regression Line (Degree 2)")
ax.set_title("Polynomial Regression (Degree 2)")
ax.set_xlabel("Feature")
ax.set_ylabel("Target")
ax.legend()
plt.show()


Q10. Generate synthetic data for simple linear regression (use random values for X and y) and fit a linear
regression model to the data. Print the model's coefficient and intercept.

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Random inputs in [0, 10) and a noisy target y = 2x + 5 (noise std 2).
X = np.random.rand(100, 1) * 10
noise = np.random.randn(100, 1) * 2
y = 2 * X + 5 + noise

model = LinearRegression().fit(X, y)

# y is a column vector, so coef_ is 2-D and intercept_ is 1-D.
slope = model.coef_[0, 0]
intercept = model.intercept_[0]
print(f"Coefficient: {slope:.2f}")
print(f"Intercept: {intercept:.2f}")


Q11. Write a Python script that fits polynomial regression models of different degrees to a synthetic dataset and
compares their performance.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

# Seeded generator so the data, the split and the reported scores are reproducible.
rng = np.random.default_rng(42)
X = rng.random((100, 1)) * 10
y = 2 * X**3 - 5 * X**2 + 3 * X + rng.standard_normal((100, 1)) * 30

# Hold out 20% of the samples. Training-set MSE can only decrease as the degree
# grows, so comparing degrees is only meaningful on data the model has not seen.
shuffled = rng.permutation(len(X))
n_test = len(X) // 5
test_idx, train_idx = shuffled[:n_test], shuffled[n_test:]
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

degrees = [1, 2, 3, 4]
mse_scores = []  # held-out MSE, one entry per degree, in the same order as `degrees`

# Evenly spaced grid for drawing the curves; it does not depend on the degree.
X_new = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)

plt.figure(figsize=(8, 5))

for degree in degrees:
    poly = PolynomialFeatures(degree=degree)

    # Fit on the training split only.
    model = LinearRegression()
    model.fit(poly.fit_transform(X_train), y_train)

    # Score on the held-out split.
    mse = mean_squared_error(y_test, model.predict(poly.transform(X_test)))
    mse_scores.append(mse)

    y_new_pred = model.predict(poly.transform(X_new))
    plt.plot(X_new, y_new_pred, label=f'Degree {degree} (MSE: {mse:.2f})')

plt.scatter(X, y, color='black', alpha=0.7, label='Data Points')
plt.title("Polynomial Regression of Different Degrees")
plt.xlabel("Feature")
plt.ylabel("Target")
plt.legend()
plt.show()

for degree, mse in zip(degrees, mse_scores):
    print(f"Degree {degree} - MSE: {mse:.2f}")


Q12. Write a Python script that fits a multiple linear regression model with two features and prints the model's
coefficients, intercept, and R-squared score.

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Synthetic data: 100 samples, 2 features, target = 5*x0 + 3*x1 + unit Gaussian noise.
X = np.random.rand(100, 2)
x0, x1 = X.T
y = x0 * 5 + x1 * 3 + np.random.randn(100)

# Fit on the full data set.
model = LinearRegression().fit(X, y)

# Report the fitted parameters and the in-sample R-squared.
in_sample_r2 = r2_score(y, model.predict(X))
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)
print("R-squared score:", in_sample_r2)


Q13. Write a Python script that generates synthetic data, fits a linear regression model, and visualizes the
regression line along with the data points.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Synthetic data: inputs in [0, 10), target y = 2x + 5 plus unit Gaussian noise.
X = np.random.rand(100, 1) * 10
noise = np.random.randn(100, 1)
y = 2 * X + 5 + noise

# Fit on all samples.
model = LinearRegression().fit(X, y)

# Observations as points, fitted values as the red line.
fig, ax = plt.subplots()
ax.scatter(X, y, color='blue')
ax.plot(X, model.predict(X), color='red')
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.set_title('Linear Regression')
plt.show()


Q14. Write a Python script that uses the Variance Inflation Factor (VIF) to check for multicollinearity in a dataset
with multiple features.

In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Synthetic data: three independent uniform features.
X = pd.DataFrame(
    np.random.rand(100, 3),
    columns=['Feature1', 'Feature2', 'Feature3'],
)

# Prepend an intercept column before computing VIFs.
X_const = add_constant(X)
design = X_const.values

# One VIF per design-matrix column (the added constant included).
vif_data = pd.DataFrame({
    "Feature": X_const.columns,
    "VIF": [variance_inflation_factor(design, col) for col in range(design.shape[1])],
})

print(vif_data)


Q15. Write a Python script that generates synthetic data for a polynomial relationship (degree 4), fits a
polynomial regression model, and plots the regression curve.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Synthetic quartic relationship with unit Gaussian noise.
X = np.random.rand(100, 1) * 10
noise = np.random.randn(100, 1)
y = X**4 - 2 * X**3 + 5 * X**2 + 3 * X + noise

# Degree-4 polynomial expansion followed by an ordinary linear fit.
poly = PolynomialFeatures(degree=4)
X_poly = poly.fit_transform(X)
model = LinearRegression().fit(X_poly, y)

# Sort the inputs so the curve is drawn left to right.
X_sorted = np.sort(X, axis=0)
curve = model.predict(poly.transform(X_sorted))

fig, ax = plt.subplots()
ax.scatter(X, y, color='blue')
ax.plot(X_sorted, curve, color='red')
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.set_title('Polynomial Regression (Degree 4)')
plt.show()


Q16. Write a Python script that creates a machine learning pipeline with data standardization and a multiple
linear regression model, and prints the R-squared score.

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Generate synthetic data. The generator is seeded so the printed score is
# reproducible; seeding only the split would still give a new data set each run.
X, y = make_regression(n_samples=100, n_features=5, noise=0.1, random_state=42)

# Split data (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipeline: standardise the features, then fit a linear regression.
# The scaler is fitted on the training split only.
pipeline = make_pipeline(StandardScaler(), LinearRegression())
pipeline.fit(X_train, y_train)

# Print R-squared score on the held-out split
print("R-squared score:", pipeline.score(X_test, y_test))


Q17. Write a Python script that performs polynomial regression (degree 3) on a synthetic dataset and plots the
regression curve.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Synthetic cubic relationship with unit Gaussian noise.
X = np.random.rand(100, 1) * 10
noise = np.random.randn(100, 1)
y = 3 * X**3 - 2 * X**2 + noise

# Degree-3 polynomial expansion followed by an ordinary linear fit.
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
model = LinearRegression().fit(X_poly, y)

# Sort the inputs so the curve is drawn left to right.
X_sorted = np.sort(X, axis=0)
curve = model.predict(poly.transform(X_sorted))

fig, ax = plt.subplots()
ax.scatter(X, y, color='blue')
ax.plot(X_sorted, curve, color='red')
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.set_title('Polynomial Regression (Degree 3)')
plt.show()


Q18. Write a Python script that performs multiple linear regression on a synthetic dataset with 5 features. Print
the R-squared score and model coefficients.

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Synthetic data: 100 samples, 5 features, linear target with unit Gaussian noise.
X = np.random.rand(100, 5)
x0, x1, x2, x3, x4 = X.T
y = 3 * x0 + 2 * x1 - x2 + 4 * x3 - 2 * x4 + np.random.randn(100)

# Fit on the full data set.
model = LinearRegression().fit(X, y)

# Report the in-sample R-squared and the fitted coefficients.
in_sample_r2 = r2_score(y, model.predict(X))
print("R-squared score:", in_sample_r2)
print("Coefficients:", model.coef_)


Q19. Write a Python script that generates synthetic data for linear regression, fits a model, and visualizes the
data points along with the regression line.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Synthetic data: inputs in [0, 10), target y = 5x + 3 plus unit Gaussian noise.
X = np.random.rand(100, 1) * 10
noise = np.random.randn(100, 1)
y = 5 * X + 3 + noise

# Fit on all samples.
model = LinearRegression().fit(X, y)

# Observations as points, fitted values as the red line.
fig, ax = plt.subplots()
ax.scatter(X, y, color='blue')
ax.plot(X, model.predict(X), color='red')
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.set_title('Linear Regression')
plt.show()


Q20. Create a synthetic dataset with 3 features and perform multiple linear regression. Print the model's R-squared score and coefficients.

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Synthetic data: 100 samples, 3 features, linear target with unit Gaussian noise.
X = np.random.rand(100, 3)
x0, x1, x2 = X.T
y = 3 * x0 + 2 * x1 - x2 + np.random.randn(100)

# Fit on the full data set.
model = LinearRegression().fit(X, y)

# Report the in-sample R-squared and the fitted coefficients.
in_sample_r2 = r2_score(y, model.predict(X))
print("R-squared score:", in_sample_r2)
print("Coefficients:", model.coef_)


Q21. Write a Python script that demonstrates how to serialize and deserialize machine learning models using
joblib instead of pickling.

In [None]:
import joblib
import numpy as np  # used below for data generation; this cell did not import it
from sklearn.linear_model import LinearRegression

# Generate synthetic data: inputs in [0, 10), target y = 3x + 5 plus unit Gaussian noise
X = np.random.rand(100, 1) * 10
y = 3 * X + 5 + np.random.randn(100, 1)

# Fit model
model = LinearRegression()
model.fit(X, y)

# Save the model using joblib
joblib.dump(model, 'linear_model.pkl')

# Load the model back from disk.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
loaded_model = joblib.load('linear_model.pkl')

# Make predictions with the loaded model on the first five samples
print("Predictions:", loaded_model.predict(X[:5]))


Q22. Write a Python script to perform linear regression with categorical features using one-hot encoding. Use
the Seaborn 'tips' dataset.

In [None]:
import seaborn as sns
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Load the tips data set.
tips = sns.load_dataset('tips')

# One-hot encode the categorical columns, dropping the first level of each.
tips_encoded = pd.get_dummies(tips, drop_first=True)

# 'tip' is the target; all remaining encoded columns are predictors.
y = tips_encoded['tip']
X = tips_encoded.drop(columns='tip')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split.
model = LinearRegression().fit(X_train, y_train)

# R-squared on the held-out split.
test_r2 = model.score(X_test, y_test)
print("R-squared score:", test_r2)


Q23. Compare Ridge Regression with Linear Regression on a synthetic dataset and print the coefficients and R-squared score.

In [None]:
import numpy as np
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import r2_score

# Synthetic data: 3 features, linear target with unit Gaussian noise.
X = np.random.rand(100, 3)
x0, x1, x2 = X.T
y = 3 * x0 + 2 * x1 - x2 + np.random.randn(100)

# Ordinary least squares, fitted and evaluated on the full data set.
lin_reg = LinearRegression().fit(X, y)
lin_reg_pred = lin_reg.predict(X)

# Ridge regression with alpha=1.0 on the same data.
ridge_reg = Ridge(alpha=1.0).fit(X, y)
ridge_reg_pred = ridge_reg.predict(X)

# Coefficients and in-sample R-squared for both models.
lin_r2 = r2_score(y, lin_reg_pred)
ridge_r2 = r2_score(y, ridge_reg_pred)
print("Linear Regression coefficients:", lin_reg.coef_)
print("Ridge Regression coefficients:", ridge_reg.coef_)
print("Linear Regression R-squared:", lin_r2)
print("Ridge Regression R-squared:", ridge_r2)


Q24. Write a Python script that uses cross-validation to evaluate a Linear Regression model on a synthetic
dataset.

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Synthetic data: 3 features, linear target with unit Gaussian noise.
X = np.random.rand(100, 3)
x0, x1, x2 = X.T
y = 3 * x0 + 2 * x1 - x2 + np.random.randn(100)

# Unfitted estimator; cross_val_score fits it once per fold.
model = LinearRegression()

# 5-fold cross-validation using the estimator's default scorer.
cv_scores = cross_val_score(model, X, y, cv=5)

# Per-fold scores and their mean.
mean_score = cv_scores.mean()
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", mean_score)


Q25. Write a Python script that compares polynomial regression models of different degrees and prints the R-squared score for each.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Generate synthetic cubic data. The generator is seeded so the data, the
# split and the printed scores are reproducible.
rng = np.random.default_rng(42)
X = rng.random((100, 1)) * 10
y = 2 * X**3 + 3 * X**2 - 4 * X + rng.standard_normal((100, 1))

# Hold out 20% of the samples. Training-set R-squared can only increase with
# the degree, so the degrees are compared on data the model has not seen.
shuffled = rng.permutation(len(X))
n_test = len(X) // 5
test_idx, train_idx = shuffled[:n_test], shuffled[n_test:]
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

# Sorted inputs so each curve is drawn left to right; independent of the degree.
X_sorted = np.sort(X, axis=0)

# Try polynomial degrees from 1 to 5
for degree in range(1, 6):
    poly = PolynomialFeatures(degree)

    # Fit on the training split only.
    model = LinearRegression()
    model.fit(poly.fit_transform(X_train), y_train)

    # Plot results: all observations plus the fitted curve, one figure per degree
    plt.figure(figsize=(6, 4))
    plt.scatter(X, y, color='blue')
    plt.plot(X_sorted, model.predict(poly.transform(X_sorted)), color='red')
    plt.title(f'Polynomial Regression (Degree {degree})')
    plt.xlabel('X')
    plt.ylabel('y')
    plt.show()

    # Print R-squared score measured on the held-out split
    print(f"Degree {degree} R-squared:", r2_score(y_test, model.predict(poly.transform(X_test))))
S