In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
# Dataset 1: charging-station table (joined to the EV table on 'States/UTs' further down).
pcs = pd.read_csv(filepath_or_buffer="PCS_data.csv")

# Preview the first five rows.
pcs.head(5)


In [None]:
# Dataset 2: EV table (joined to the charging-station table on 'States/UTs' further down).
# NOTE(review): presumably one row per state/UT -- confirm against the CSV.
ev = pd.read_csv(filepath_or_buffer="EV_India.csv")

# Preview the first five rows.
ev.head(5)

In [None]:
# Outer-join the EV and charging-station tables on the state/UT name so that a
# state present in only one source is still kept (its other columns become NaN).
# indicator=True adds a temporary '_merge' column recording which side each row
# came from, so key mismatches are reported instead of passing silently as
# NaN-filled rows.
merged_flagged = pd.merge(ev, pcs, on='States/UTs', how='outer', indicator=True)

unmatched = merged_flagged.loc[merged_flagged['_merge'] != 'both', ['States/UTs', '_merge']]
if not unmatched.empty:
    print(f"{len(unmatched)} state/UT key(s) found in only one dataset:")
    print(unmatched.to_string(index=False))

# Drop the helper column so the saved file and downstream frames have exactly
# the columns of a plain outer merge.
PCS_EV_data = merged_flagged.drop(columns='_merge')

PCS_EV_data.to_csv('merged(EV&PCS)_dataset.csv', index=False)

# Independent copy used for the geographical analysis, so changes to ge_data
# do not alter PCS_EV_data.
ge_data = PCS_EV_data.copy()

ge_data

In [None]:
# EV model table; its 'Brand', 'Range_Km' and 'PriceEuro' columns are plotted later on.
pe_data = pd.read_csv(filepath_or_buffer="ev_models.csv")

# Preview the first five rows.
pe_data.head(5)

In [None]:
# Behaviour / demographic dataset used for the customer segmentation.
# The file name is kept exactly as spelled in the path.
de_data = pd.read_csv(filepath_or_buffer="Indian automoble buying behavour study.csv")

# Preview the first five rows.
de_data.head()

In [None]:
# Descriptive statistics of the merged EV / charging-station frame.
ge_summary = ge_data.describe()
ge_summary


In [None]:
# Descriptive statistics of the behaviour / demographic frame.
de_summary = de_data.describe()
de_summary

In [None]:
# Per-column count of missing values: the geographic frame first, then the
# demographic frame.
for frame in (ge_data, de_data):
    print(frame.isna().sum())


Geographic distributions

In [None]:
# Order the rows from most to fewest EVs; rows with a missing count are placed last.
ge_sorted = ge_data.sort_values('Total Electric Vehicle', ascending=False, na_position='last')

In [None]:
# Number of EVs by state, in the order of ge_sorted (highest count first).
# Only one chart is drawn here, so it gets a single full-size axes; placing it
# in the top slot of a 2x1 grid would squeeze it into half of the figure.
fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(x='States/UTs', y='Total Electric Vehicle', data=ge_sorted, ax=ax)
plt.xticks(rotation=90)  # vertical state labels so they do not overlap
ax.set_title('Number of Electric Vehicles by State')

fig.tight_layout()  # keep the rotated labels inside the figure
plt.show()  # render the figure rather than echoing the repr of the last call

# Observation: Maharashtra and Karnataka have the most EVs, with Delhi a close third.

In [None]:
# Two stacked histograms (20 bins, KDE overlay) of the charging-infrastructure columns.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(14, 7))

panels = [
    ('Number of EV Chargers', 'Distribution of EV Chargers'),
    ('Number of Charging Points/Vehicle Connectors', 'Distribution of Charging Points'),
]
for ax, (column, title) in zip(axes, panels):
    sns.histplot(ge_data[column], bins=20, kde=True, ax=ax)
    ax.set_title(title)

fig.tight_layout()
plt.show()

In [None]:
# Operational public charging stations per state/UT, read from the merged frame ge_data.
fig, ax = plt.subplots(figsize=(12, 8))
sns.pointplot(
    data=ge_data,
    x='No. of Operational Public Charging Station (PCS)',
    y='States/UTs',
    color='orange',
    ax=ax,
)
ax.set_xlabel('Number of Operational Public Charging Stations (PCS)', family='serif', size=12, labelpad=10)
ax.set_ylabel('State/UT', family='serif', size=12)
ax.tick_params(direction='inout')  # tick marks straddle the axis line
# Apply the serif font to the tick labels on both axes.
plt.xticks(family='serif', size=10)
plt.yticks(family='serif', size=10)
ax.set_title(label='Available Public Charging Stations for EVs in India', weight=200, family='serif', size=15, pad=12)
plt.show()

Demographic distributions

In [None]:
# Two stacked histograms (20 bins, KDE overlay): age on top, salary below.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(14, 7))

panels = [
    ('Age', 'Distribution of Age'),
    ('Salary', 'Distribution of Salary'),
]
for ax, (column, title) in zip(axes, panels):
    sns.histplot(de_data[column], bins=20, kde=True, ax=ax)
    ax.set_title(title)

fig.tight_layout()
plt.show()



In [None]:
# Scatter of salary against age, one point per row of de_data.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=de_data, x='Age', y='Salary', ax=ax)
ax.set(title='Age vs Salary', xlabel='Age', ylabel='Salary')
plt.show()


In [None]:

# Number of rows per car make, drawn as a donut chart.
car_make_counts = de_data['Make'].value_counts()

# Small canvas; the pie radius below is larger than the axes, so the chart
# extends past it.
fig, ax = plt.subplots(figsize=(3, 3))
car_make_counts.plot.pie(ax=ax, radius=2, cmap='viridis', startangle=0, textprops=dict(family='serif'))
# A white disc over the centre turns the pie into a donut. `colors` expects a
# sequence of colours, so the single colour is wrapped in a list instead of
# being passed as a bare string.
ax.pie(x=[1], radius=1.2, colors=['white'])
ax.set_title(label='Distribution of Cars', family='serif', size=10, pad=100)  # large pad lifts the title clear of the oversized pie
ax.set_ylabel('')  # no y-axis label next to the pie
plt.show()

EV distributions


In [None]:
# Price against driving range for each EV model, coloured by brand.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=pe_data, x='Range_Km', y='PriceEuro', hue='Brand', ax=ax)
ax.set(title='Range vs Price of EVs', xlabel='Range (Km)', ylabel='Price (Euro)')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # legend sits outside the axes, to the right
plt.show()

Segmentation

In [None]:
# Columns fed to the segmentation, grouped by how they are prepared.
numeric_features = [
    'Age', 'No of Dependents', 'Salary', 'Wife Salary', 'Total Salary', 'Price',
]
categorical_features = [
    'Profession', 'Marrital Status', 'Education', 'Personal loan', 'House Loan', 'Wife Working',
]

# Numeric columns are taken unchanged; the categorical columns go through
# pd.get_dummies to become indicator columns.
X_numeric = de_data.loc[:, numeric_features]
X_categorical = pd.get_dummies(de_data.loc[:, categorical_features])

# Feature matrix: numeric columns first, indicator columns after them.
X = pd.concat(objs=[X_numeric, X_categorical], axis=1)

In [None]:
# Standardise every feature column; the fitted scaler is kept in `scaler`.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Project the standardised features onto their first two principal components;
# the 2-D result is what gets clustered and plotted.
# random_state fixes the seed for PCA solvers that use randomness, in line with
# the random_state=0 already passed to KMeans in this notebook, so a re-run
# reproduces the same projection.
pca = PCA(n_components=2, random_state=0)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Within-cluster sum of squares (inertia) for k = 1..10, used for the elbow plot.
kmeans_kwargs = dict(init='k-means++', max_iter=300, n_init=10, random_state=0)

wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, **kmeans_kwargs).fit(X_pca)
    wcss.append(kmeans.inertia_)


In [None]:
# Elbow curve: WCSS against the number of clusters (1..10).
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, 11), wcss)
ax.set(title='Elbow Method', xlabel='Number of clusters', ylabel='WCSS')
plt.show()

In [None]:
# Number of segments to fit; revisit this value against the elbow curve.
n_clusters = 3

# Fit K-means on the 2-D PCA projection and keep the cluster id assigned to each row.
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0).fit(X_pca)
cluster_labels = kmeans.labels_


In [None]:
# Store each row's cluster id on the demographic frame. This adds (or
# overwrites) the 'Cluster' column in place; the count plot of car makes per
# cluster reads it.
de_data['Cluster'] = cluster_labels

# Scatter of the two principal components, coloured by cluster id.
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis')
ax.set(title='Customer Segments', xlabel='First Principal Component', ylabel='Second Principal Component')
# Cluster ids are discrete labels, so show one legend entry per cluster instead
# of a continuous colour bar whose in-between values have no meaning.
ax.legend(*scatter.legend_elements(), title='Cluster')
plt.show()

In [None]:
# Mean silhouette coefficient of the cluster assignment on the PCA projection.
silhouette_avg = silhouette_score(X_pca, labels=cluster_labels)
print(f"The average silhouette score is: {silhouette_avg}")

# Number of rows per car make within each cluster.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=de_data, x='Cluster', hue='Make', ax=ax)
ax.set_title('Distribution of Car Makes in Each Cluster')
plt.show()