<a href="https://colab.research.google.com/github/Rafiquekuwari/machine_learning_tasks/blob/main/Machine_Learning_Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.datasets import fetch_california_housing
from scipy.stats import skew

In [None]:
# Load Dataset
# Fetch the California housing bunch and assemble features plus the
# regression target into one DataFrame (target column is named 'Target').
data = fetch_california_housing()
df = pd.DataFrame(data=data.data, columns=data.feature_names).assign(Target=data.target)
print(df.head())


In [None]:
# Data Exploration
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()
print(df.describe())

In [None]:
# Number of columns that contain at least one missing value.
has_missing_by_column = df.isna().any()
print(has_missing_by_column.sum())


In [None]:
# Handling Missing Values
print("Before Handling Missing Values:")
print(df.isnull().sum())

# Snapshot of the frame prior to imputation; the following cell compares
# its summary statistics against the imputed frame.
df_before_missing = df.copy()

# Median imputation is fitted on, and applied to, every column of `df`
# (this includes the 'Target' column).
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(df)
df[df.columns] = imputed_values

print("After Handling Missing Values:")
print(df.isnull().sum())
print("Missing values handled.")

In [None]:
# Display before and after summary
# Each entry pairs the heading to print with the frame to summarise.
summary_pairs = (
    ("Summary Before Handling Missing Values:", df_before_missing),
    ("Summary After Handling Missing Values:", df),
)
for heading, frame in summary_pairs:
    print(heading)
    print(frame.describe())


In [None]:
# Handling Outliers using IQR Method
print("Before Removing Outliers:")
print(df.describe())

# Per-column quartiles and inter-quartile range (computed over every column
# of `df`, so 'Target' takes part in the filter as well).
Q1, Q3 = df.quantile(0.25), df.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is discarded when any of its values lies outside its column's fences.
outlier_cells = (df < lower_fence) | (df > upper_fence)
rows_with_outlier = outlier_cells.any(axis=1)
df = df[~rows_with_outlier]

print("After Removing Outliers:")
print(df.describe())
print("Outliers removed using IQR method.")

In [None]:
# Pairwise correlations between all columns currently in `df`,
# rendered as an annotated heatmap.
corr_matrix = df.corr()
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()
print("Correlation analysis completed.")

In [None]:
# Feature Selection: Removing highly correlated features
print("Before Removing Highly Correlated Features:")
print(df.columns)
threshold = 0.9

# Recompute the correlations in this cell, on the predictors only:
#  * the cell no longer depends on a matrix built in another cell, which can
#    be stale if `df` changed or cells were run out of order;
#  * 'Target' can never be selected for removal. It is the last column, so
#    with the full matrix a feature correlating > threshold with the target
#    would have caused 'Target' itself to be dropped, breaking every later
#    cell that reads df['Target'].
feature_corr = df.drop(columns=['Target']).corr().abs()

# For each pair (i, j) with j < i whose absolute correlation exceeds the
# threshold, mark the later column (i) for removal and keep the earlier one.
correlated_features = set()
for i, column in enumerate(feature_corr.columns):
    for j in range(i):
        if feature_corr.iloc[i, j] > threshold:
            correlated_features.add(column)

# Pass a sorted list so the drop order is deterministic.
df = df.drop(columns=sorted(correlated_features))
print("After Removing Highly Correlated Features:")
print(df.columns)
print("Highly correlated features removed.")


In [None]:
# Skewness handling: log1p-transform predictors whose |skew| exceeds 0.75.
print("Before Skewness Handling:")
skewed_features = df.drop(columns=['Target']).apply(skew).sort_values(ascending=False)
print(skewed_features)
for feature in skewed_features.index:
    if abs(skewed_features[feature]) > 0.75:
        # log1p(x) is only defined for x > -1. Transforming a column that
        # contains smaller values (e.g. negative coordinates) would silently
        # fill it with NaN / -inf, so such columns are left untouched.
        if (df[feature] > -1).all():
            df[feature] = np.log1p(df[feature])
        else:
            print(f"Skipping log1p for {feature}: contains values <= -1")
print("After Skewness Handling:")
skewed_features = df.drop(columns=['Target']).apply(skew).sort_values(ascending=False)
print(skewed_features)

In [None]:
# `skewed_features` was recomputed after the log transform in the previous
# cell, so this filter shows the features whose |skew| is STILL above 0.75,
# not the ones that were candidates for the transform. The heading is worded
# accordingly.
print("Features still above the skewness threshold after log transformation:")
print(skewed_features[skewed_features.abs() > 0.75])


In [None]:
# Train-Test Split
# Separate the regression target from the predictors, then hold out 20% of
# the rows for testing (fixed seed for a repeatable split).
y = df['Target']
X = df.drop(columns=['Target'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Train-test split completed.")

In [None]:
# Data Scaling
print("Before Scaling:")
print(pd.DataFrame(X_train).describe())

# Fit the scaler on the training rows only, then apply the same fitted
# transformation to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("After Scaling:")
print(pd.DataFrame(X_train_scaled).describe())

print("Data scaling completed.")


In [None]:
# Feature Engineering: Adding Polynomial Features
print("Before Polynomial Feature Addition:")
print(X_train.shape)

# Degree-2 expansion (no bias column) of the scaled predictors; the expander
# is fitted on the training matrix and reused for the test matrix.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_train_scaled)
X_train_poly = poly.transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

print("After Polynomial Feature Addition:")
print(X_train_poly.shape)
print("Polynomial features added.")

In [None]:
# Model Training & Evaluation
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit `model`, predict on the test set and compute regression metrics.

    The model is fitted in place on (X_train, y_train) and then used to
    predict X_test. The reported time covers both the fit and the predict
    call.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test : feature matrices for training and testing.
    y_train, y_test : target values for training and testing.

    Returns
    -------
    tuple
        ``(mae, rmse, r2, elapsed_time, y_pred)`` where ``elapsed_time`` is
        in seconds and ``y_pred`` holds the predictions for ``X_test``.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump with system clock adjustments and
    # may be too coarse to time fast models.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    elapsed_time = time.perf_counter() - start_time

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    return mae, rmse, r2, elapsed_time, y_pred

In [None]:

# Baseline: ordinary least squares on all scaled predictors.
model = LinearRegression()

# Hold-out metrics from a single fit, plus the mean R² over 5 CV folds
# computed on the training data.
mae, rmse, r2, elapsed_time, y_pred = evaluate_model(
    model, X_train_scaled, X_test_scaled, y_train, y_test
)
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
cross_val = cv_scores.mean()

# Collect the metrics, each rounded to six decimal places.
raw_metrics = {
    'MAE': mae,
    'RMSE': float(rmse),
    'R² Score': r2,
    'Cross-Val R²': float(cross_val),
    'Time (s)': elapsed_time,
}
results = {metric: round(value, 6) for metric, value in raw_metrics.items()}
predictions = y_pred

# Report
print(f"Multiple Linear Regression")
print(results)


In [None]:
# Sanity check: (rows, columns) of the scaled training matrix.
scaled_train_shape = X_train_scaled.shape
print(scaled_train_shape)


In [None]:
# Registry of the regressors to compare, keyed by the display name used in
# the results table. 'Simple', 'Multiple' and 'Polynomial' regression share
# the same estimator class; the evaluation loop feeds them different inputs.
# The tree-based models are randomised, so they are seeded (same seed as the
# train/test split) to make the comparison reproducible across runs.
models = {
    'Simple Linear Regression': LinearRegression(),
    'Multiple Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Polynomial Regression': LinearRegression(),
    'SVR': SVR(kernel='rbf'),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
}

In [None]:
results = {}
predictions = {}
cross_val_results = {}

# Single-predictor inputs: only the first column, kept two-dimensional.
X_train_simple = X_train_scaled[:, :1]
X_test_simple = X_test_scaled[:, :1]

# Models that need something other than the full scaled matrix.
inputs_by_model = {
    'Polynomial Regression': (X_train_poly, X_test_poly),
    'Simple Linear Regression': (X_train_simple, X_test_simple),
}
default_inputs = (X_train_scaled, X_test_scaled)

for name, model in models.items():
    train_matrix, test_matrix = inputs_by_model.get(name, default_inputs)

    # Hold-out metrics from one fit, then mean R² over 5 CV folds on the
    # same training matrix.
    mae, rmse, r2, elapsed_time, y_pred = evaluate_model(
        model, train_matrix, test_matrix, y_train, y_test
    )
    cross_val = cross_val_score(model, train_matrix, y_train, cv=5, scoring='r2').mean()

    results[name] = {
        'MAE': mae,
        'RMSE': rmse,
        'R² Score': r2,
        'Cross-Val R²': cross_val,
        'Time (s)': elapsed_time,
    }
    predictions[name] = y_pred


In [None]:
# Convert results to DataFrame & Display
# Build with one column per model, then transpose so that each row is a
# model and each column a metric.
metrics_by_model = pd.DataFrame(results)
results_df = metrics_by_model.transpose()
print(results_df)



In [None]:
# Visualization
# One bar per model showing its hold-out R² score.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=results_df, x=results_df.index, y='R² Score', ax=ax)
plt.xticks(rotation=45)
ax.set_title("Model Performance Comparison")
plt.show()
print("Visualization completed.")


In [None]:
import statsmodels.api as sm

# Normal Q-Q plot of the hold-out residuals of every model.
for name, y_pred in predictions.items():
    residuals = y_test - y_pred
    # fit=True estimates loc/scale from the residuals and standardises them
    # before plotting. Without it the raw residuals are compared against
    # standard-normal quantiles, and the 45-degree reference line is only
    # meaningful if the residuals already have unit variance.
    sm.qqplot(residuals, fit=True, line='45')
    plt.title(f"Q-Q Plot of Residuals for {name}")
    plt.show()

In [None]:
# Conclusion
# The winner is the model (row label) with the highest hold-out R² score.
r2_by_model = results_df['R² Score']
best_model = r2_by_model.idxmax()
best_r2 = results_df.loc[best_model, 'R² Score']
print(f"The best performing model is: {best_model} with an R² Score of {best_r2:.4f}")


The best performing model is: Random Forest with an R² Score of 0.6961


In [None]:
# Shell escape: push the local `main` branch to the `origin` remote.
!git push origin main


Everything up-to-date


In [None]:
# Shell escape: copy the notebook file from the Drive folder into /content/.
# NOTE(review): hard-coded absolute paths; presumably these only exist in a
# Colab session with Google Drive mounted — confirm before running elsewhere.
!cp /content/drive/MyDrive/Colab_Notebooks/Machine_Learning_Task_2.ipynb /content/


In [None]:
# Shell escape: long-format listing of /content/ with human-readable sizes.
!ls -lh /content/


total 28K
drwx------ 6 root root 4.0K Mar 23 12:24 drive
-rw------- 1 root root  14K Mar 23 12:43 Machine_Learning_Task_2.ipynb
-rw-r--r-- 1 root root  200 Mar 23 12:04 README.md
drwxr-xr-x 1 root root 4.0K Mar 20 13:31 sample_data


In [None]:
# Shell escapes: stage the notebook file, commit it with the message below,
# then push the `main` branch to the `origin` remote.
!git add Machine_Learning_Task_2.ipynb
!git commit -m "Added Machine Learning Task 2 notebook"
!git push origin main


[main 01196ff] Added Machine Learning Task 2 notebook
 1 file changed, 1 insertion(+), 1 deletion(-)
 rewrite Machine_Learning_Task_2.ipynb (97%)
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 2 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 680 bytes | 680.00 KiB/s, done.
Total 3 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.
To https://github.com/MohammedRafiqueKuwari/machine_Learning_Tasks.git
   2db1cb6..01196ff  main -> main


In [None]:
import json

# Sanity check that the saved notebook file is well-formed JSON.
# Notebook files are UTF-8 encoded JSON, so the encoding is stated explicitly
# instead of relying on the platform's default text encoding.
with open("Machine_Learning_Task_2.ipynb", "r", encoding="utf-8") as f:
    json.load(f)  # raises json.JSONDecodeError if the JSON is invalid
