# Electric Vehicle Sales by State in India
Machine Learning Project (3-Year Experience Level)

This notebook analyzes and predicts electric vehicle (EV) sales by state in India.

In [None]:
# Step 1: Data Collection
import os

import numpy as np
import pandas as pd

# Load the dataset.
# The CSV location can be overridden through the EV_SALES_CSV environment
# variable so the notebook can run on machines other than the author's.
# When the variable is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "EV_SALES_CSV",
    r"C:\Users\MY HP\Downloads\Electric Vehicle Sales by State in India.csv",
)
df = pd.read_csv(DATA_PATH)

# Display first few rows
df.head()

In [None]:
# Step 2: Data Preprocessing
# Convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Check for missing values (per-column count, printed before any imputation)
print(df.isnull().sum())

# Fill missing sales quantities with the column median.
# The result is assigned back rather than calling fillna(inplace=True) on
# df['EV_Sales_Quantity']: that chained in-place form operates on a temporary
# object, raises a FutureWarning in pandas 2.x, and does not modify df when
# Copy-on-Write is enabled.
sales_median = df['EV_Sales_Quantity'].median()
df['EV_Sales_Quantity'] = df['EV_Sales_Quantity'].fillna(sales_median)

# Fill whatever is still missing with each column's most frequent value
# (the first row of df.mode()).
df = df.fillna(df.mode().iloc[0])

df.head()

In [None]:
# Step 3: Exploratory Data Analysis (EDA)
import matplotlib.pyplot as plt
import seaborn as sns

# Line chart of sales quantity against year, with one coloured line per state
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x='Year', y='EV_Sales_Quantity', hue='State', data=df, ax=ax)
ax.set_title('EV Sales by State over the Years')
plt.show()

# Bar chart of sales quantity per vehicle category; ci=None turns off error bars
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='Vehicle_Category', y='EV_Sales_Quantity', ci=None, ax=ax)
ax.set_title('EV Sales by Vehicle Category')
plt.show()

In [None]:
# Step 4: Feature Engineering
# Derive numeric calendar features (month number, day of month) from 'Date'
date_values = df['Date']
df['Month'] = date_values.dt.month
df['Day'] = date_values.dt.day

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the indicators are not redundant
categorical_cols = ['State', 'Vehicle_Class', 'Vehicle_Category', 'Vehicle_Type']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Remove the raw date and the month name from the encoded frame
df_encoded = df_encoded.drop(columns=['Date', 'Month_Name'])

df_encoded.head()

In [None]:
# Step 5: Modeling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Separate the target column from the predictor columns
target_col = 'EV_Sales_Quantity'
y = df_encoded[target_col]
X = df_encoded.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training rows (fit returns the estimator)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict sales for the held-out rows
y_pred = model.predict(X_test)

# Report the root mean squared error on the held-out rows
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

In [None]:
# Step 6: Model Evaluation
# Scatter of the held-out actual values against the model's predictions
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred)
ax.set_title('Actual vs Predicted EV Sales')
ax.set_xlabel('Actual EV Sales')
ax.set_ylabel('Predicted EV Sales')
plt.show()

# Pair each training column with its importance score, highest first
importance = model.feature_importances_
feature_importance = pd.Series(
    importance, index=X_train.columns
).sort_values(ascending=False)

# Bar chart of the 20 highest-ranked features
fig, ax = plt.subplots(figsize=(10, 6))
feature_importance.head(20).plot(kind='bar', ax=ax)
ax.set_title('Top 20 Feature Importance')
plt.show()

# Step 7: Conclusion
- Cleaned and preprocessed EV sales dataset.

- Extracted date features and encoded categorical variables.

- Built a Random Forest Regressor to predict EV sales.

- Evaluated the model using RMSE.

- Visualized trends, sales by state, vehicle category, and feature importance.