# Final Report - Machine Learning Project


## Step 1: Loading the Data
We start by importing the necessary libraries and loading the dataset.


In [None]:
# Ignore warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")

# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Read the project CSV into a DataFrame.
data_path = "final_data.csv"  # Ensure this path is correct
print("Loading dataset from file...")
data = pd.read_csv(data_path)
print("Data Loaded Successfully!")

# Quick overview: column dtypes / non-null counts, then the leading rows.
print("\nDataset Info:")
data.info()
print("\nFirst 5 Rows:", data.head(), sep="\n")

##  Step 2: Exploratory Data Analysis (EDA)
Before we train our models, we analyze the dataset to check for:
- **Missing values**
- **Duplicates**
- **Outliers**
- **Feature correlations**

In [None]:
# Checking for missing values (per-column count of null entries)
print("\nMissing Values:")
print(data.isnull().sum())

# Checking for duplicates (number of rows that repeat an earlier row)
print("\nDuplicate Rows:", data.duplicated().sum())

# Checking summary statistics
print("\nSummary Statistics:")
print(data.describe())

# Checking correlation between features.
# Correlation is only defined for numeric data, and at this point the frame
# has not been one-hot encoded yet. pandas >= 2.0 raises on non-numeric
# columns in DataFrame.corr(), so restrict the matrix to numeric/bool columns.
numeric_data = data.select_dtypes(include=["number", "bool"])
plt.figure(figsize=(12, 6))
sns.heatmap(numeric_data.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Feature Correlation Matrix")
plt.show()


## Step 3: Data Preprocessing
Remove duplicate rows and one-hot encode categorical variables. Missing values are reported in Step 2 but are not imputed or dropped in this step.


In [None]:
# Drop duplicates if any, and report what actually happened.
duplicate_count = data.duplicated().sum()
if duplicate_count > 0:
    data.drop_duplicates(inplace=True)
    print(f"\nRemoved {duplicate_count} duplicate rows.")
else:
    print("\nNo duplicates to remove!")

# NOTE(review): missing values are not imputed or dropped in this cell;
# confirm the dataset has none before fitting models on it.

# Encoding categorical features: one-hot encode, dropping the first level of
# each encoded column (drop_first=True).
print("\nEncoding categorical variables...")
data = pd.get_dummies(data, drop_first=True)

# Save cleaned dataset (without the index column)
cleaned_data_path = "cleaned_final_data.csv"
data.to_csv(cleaned_data_path, index=False)
print(f"Cleaned data saved at: {cleaned_data_path}")


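## Step 4: Model Training
We train:
- **Regression Models** (Linear, Ridge, Lasso)
- **Classification Models** (Logistic Regression, Decision Trees, Random Forest)

**Note:** the code cell below fits and evaluates only the Linear Regression baseline on the `price` target; the other models listed above are not trained in it.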


In [None]:
# Import ML libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score, classification_report

# Separate the prediction target from the remaining predictor columns.
target = "price"  # Modify based on dataset
y = data[target]
X = data.drop(columns=[target])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a linear regression on the scaled training data and predict the
# held-out rows.
model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Report error metrics on the held-out split.
print("\nModel Evaluation:")
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
for label, score in (("MAE:", mae), ("RMSE:", rmse), ("R² Score:", r2)):
    print(label, score)


## Step 5: Model Deployment
We save the cleaned dataset and the trained model to disk for future use.


In [None]:
# Persist the preprocessed DataFrame with joblib so it can be reloaded later.
cleaned_pickle_path = "final_cleaned_data.pkl"
joblib.dump(data, cleaned_pickle_path)
print("Cleaned dataset saved as .pkl file")

In [None]:
# Save trained model
best_model_path = "best_model.pkl"
joblib.dump(model, best_model_path)
print(f"Best model saved successfully at: {best_model_path}")

# The model was fitted on standardized features, so new inputs must pass
# through the same fitted scaler before calling model.predict(). Persist the
# scaler next to the model so the saved artifacts are usable on their own.
scaler_path = "scaler.pkl"
joblib.dump(scaler, scaler_path)
print(f"Feature scaler saved successfully at: {scaler_path}")