In [None]:
from google.colab import files
import pandas as pd

# Use files.upload() to load the dataset
uploaded = files.upload()

# Assuming the file is named 'EV_cars.csv'
file_name = list(uploaded.keys())[0]
df = pd.read_csv(file_name)

print("Dataset loaded successfully!")
df.head()

In [None]:
print("="*80)
print("Investigating Price Outliers")
print("="*80)

display(outliers)

In [None]:

!pip install pandas numpy matplotlib seaborn scikit-learn


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')



In [None]:
import matplotlib
matplotlib.use('Agg') # Set the backend to Agg
import matplotlib.pyplot as plt
import seaborn as sns

print("="*80)
print("STEP 3.1: ANALYZING PRICE DISTRIBUTION")
print("="*80)


print("\nüìä Price Statistics:")
print(f"Mean Price:     ‚Ç¨{df['Price.DE.'].mean():,.2f}")
print(f"Median Price:   ‚Ç¨{df['Price.DE.'].median():,.2f}")
print(f"Std Deviation:  ‚Ç¨{df['Price.DE.'].std():,.2f}")
print(f"Min Price:      ‚Ç¨{df['Price.DE.'].min():,.2f}")
print(f"Max Price:      ‚Ç¨{df['Price.DE.'].max():,.2f}")
print(f"Price Range:    ‚Ç¨{df['Price.DE.'].max() - df['Price.DE.'].min():,.2f}")


print("\nüìà Price Quartiles:")
print(f"25th Percentile (Q1): ‚Ç¨{df['Price.DE.'].quantile(0.25):,.2f}")
print(f"50th Percentile (Q2): ‚Ç¨{df['Price.DE.'].quantile(0.50):,.2f}  [Median]")
print(f"75th Percentile (Q3): ‚Ç¨{df['Price.DE.'].quantile(0.75):,.2f}")


plt.figure(figsize=(12, 6)) # Reduced figure size


plt.subplot(1, 3, 1)
sns.histplot(df['Price.DE.'], bins=30, kde=True, color='skyblue', edgecolor='black')
plt.axvline(df['Price.DE.'].mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: ‚Ç¨{df["Price.DE."].mean():,.0f}')
plt.axvline(df['Price.DE.'].median(), color='green', linestyle='--', linewidth=2, label=f'Median: ‚Ç¨{df["Price.DE."].median():,.0f}')
plt.xlabel('Price (EUR)', fontsize=12, fontweight='bold')
plt.ylabel('Frequency', fontsize=12, fontweight='bold')
plt.title('Distribution of EV Prices\n(Histogram)', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)

plt.subplot(1, 3, 2)
box = plt.boxplot(df['Price.DE.'], vert=True, patch_artist=True,
                   boxprops=dict(facecolor='lightblue', color='blue'),
                   medianprops=dict(color='red', linewidth=2),
                   whiskerprops=dict(color='blue', linewidth=1.5),
                   capprops=dict(color='blue', linewidth=1.5))
plt.ylabel('Price (EUR)', fontsize=12, fontweight='bold')
plt.title('Price Distribution\n(Boxplot - Shows Outliers)', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3, axis='y')

# Removed text annotations for now
# median = df['Price.DE.'].median()
# q1 = df['Price.DE.'].quantile(0.25)
# q3 = df['Price.DE.'].quantile(0.75)
# plt.text(1.15, median, f'Median\n‚Ç¨{median:,.0f}', fontsize=10, va='center')
# plt.text(1.15, q1, f'Q1\n‚Ç¨{q1:,.0f}', fontsize=9, va='center')
# plt.text(1.15, q3, f'Q3\n‚Ç¨{q3:,.0f}', fontsize=9, va='center')


plt.subplot(1, 3, 3)
sns.violinplot(y=df['Price.DE.'], color='lightgreen')
plt.ylabel('Price (EUR)', fontsize=12, fontweight='bold')
plt.title('Price Distribution\n(Violin Plot)', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()


Q1 = df['Price.DE.'].quantile(0.25)
Q3 = df['Price.DE.'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = df[(df['Price.DE.'] < lower_bound) | (df['Price.DE.'] > upper_bound)]
print(f"\n‚ö†Ô∏è  Number of Price Outliers: {len(outliers)}")
print(f"   Lower Bound: ‚Ç¨{lower_bound:,.2f}")
print(f"   Upper Bound: ‚Ç¨{upper_bound:,.2f}")

if len(outliers) > 0:
    print("\nüîç Top 5 Most Expensive Outliers:")
    print(outliers.nlargest(5, 'Price.DE.')[['Car_name', 'Price.DE.', 'Battery', 'Range']])

In [None]:
print("\n" + "="*80)
print("STEP 3.2: ANALYZING ALL FEATURE DISTRIBUTIONS")
print("="*80)

numerical_features = ['Battery', 'Efficiency', 'Fast_charge', 'Range', 'Top_speed', 'acceleration..0.100.']


fig, axes = plt.subplots(3, 2, figsize=(16, 14))
axes = axes.flatten()

for idx, feature in enumerate(numerical_features):

    sns.histplot(df[feature], bins=25, kde=True, ax=axes[idx], color='coral', edgecolor='black')


    axes[idx].axvline(df[feature].mean(), color='red', linestyle='--',
                      linewidth=2, label=f'Mean: {df[feature].mean():.1f}')
    axes[idx].axvline(df[feature].median(), color='green', linestyle='--',
                      linewidth=2, label=f'Median: {df[feature].median():.1f}')

    axes[idx].set_xlabel(feature, fontsize=11, fontweight='bold')
    axes[idx].set_ylabel('Frequency', fontsize=11, fontweight='bold')
    axes[idx].set_title(f'Distribution of {feature}', fontsize=12, fontweight='bold')
    axes[idx].legend(fontsize=9)
    axes[idx].grid(True, alpha=0.3)


    print(f"\nüìä {feature} Statistics:")
    print(f"   Mean:   {df[feature].mean():.2f}")
    print(f"   Median: {df[feature].median():.2f}")
    print(f"   Std:    {df[feature].std():.2f}")
    print(f"   Min:    {df[feature].min():.2f}")
    print(f"   Max:    {df[feature].max():.2f}")

plt.tight_layout()
plt.show()

In [None]:
print("\n" + "="*80)
print("STEP 3.3: CORRELATION ANALYSIS")
print("="*80)

numerical_cols = ['Battery', 'Efficiency', 'Fast_charge', 'Price.DE.',
                  'Range', 'Top_speed', 'acceleration..0.100.']


correlation_matrix = df[numerical_cols].corr()


price_correlation = correlation_matrix['Price.DE.'].sort_values(ascending=False)
print("\nüìà Correlation with Price (sorted):")
print(price_correlation)

print("\nüí° Interpretation Guide:")
print("   0.7 to 1.0   : Strong positive correlation")
print("   0.3 to 0.7   : Moderate positive correlation")
print("   0.0 to 0.3   : Weak positive correlation")
print("  -0.3 to 0.0   : Weak negative correlation")
print("  -0.7 to -0.3  : Moderate negative correlation")
print("  -1.0 to -0.7  : Strong negative correlation")


plt.figure(figsize=(14, 10))


sns.heatmap(correlation_matrix,
            annot=True,
            fmt='.3f',
            cmap='RdYlGn',
            center=0,
            square=True,
            linewidths=2,
            linecolor='white',
            cbar_kws={"shrink": 0.8, "label": "Correlation Coefficient"},
            vmin=-1, vmax=1)

plt.title('Correlation Matrix - EV Features\n(How features relate to each other)',
          fontsize=16, fontweight='bold', pad=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()


print("\nüéØ TOP 5 Features Most Correlated with Price:")
top_5 = price_correlation.drop('Price.DE.').head(5)
for idx, (feature, corr) in enumerate(top_5.items(), 1):
    print(f"   {idx}. {feature:20s} : {corr:.3f}")

print("\n‚ö†Ô∏è  Features with Low Correlation (might not be useful):")
low_corr = price_correlation[abs(price_correlation) < 0.3].drop('Price.DE.', errors='ignore')
if len(low_corr) > 0:
    for feature, corr in low_corr.items():
        print(f"   - {feature:20s} : {corr:.3f}")
else:
    print("   All features have moderate to strong correlation!")

In [None]:
# ============================================================================
# 4. SCATTER PLOTS - FEATURES VS PRICE
# ============================================================================

print("\n" + "="*80)
print("STEP 3.4: SCATTER PLOT ANALYSIS")
print("="*80)

features_to_plot = ['Battery', 'Range', 'Top_speed', 'Efficiency',
                    'acceleration..0.100.', 'Fast_charge']

# Remove outliers from 'Price.DE.' for plotting the trend line
Q1 = df['Price.DE.'].quantile(0.25)
Q3 = df['Price.DE.'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_cleaned = df[(df['Price.DE.'] >= lower_bound) & (df['Price.DE.'] <= upper_bound)].copy()

print("\nVariance of features in cleaned data (for trend line calculation):")
for feature in features_to_plot:
    print(f"  {feature}: {df_cleaned[feature].var():.2f}")


fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for idx, feature in enumerate(features_to_plot):
    # Use seaborn.regplot which plots scatter points and a regression line
    sns.regplot(x=feature, y='Price.DE.', data=df_cleaned, ax=axes[idx],
                scatter_kws={'alpha':0.6, 's':50, 'edgecolors':'black'}, # Removed 'linewidths'
                line_kws={'color':'red', 'alpha':0.8, 'linewidth':2.5})


    axes[idx].set_xlabel(feature, fontsize=12, fontweight='bold')
    axes[idx].set_ylabel('Price (EUR)', fontsize=12, fontweight='bold')
    axes[idx].set_title(f'{feature} vs Price (with trend line)', fontsize=13, fontweight='bold')

    axes[idx].grid(True, alpha=0.3)

    # Calculate and display correlation using original data
    corr = df[[feature, 'Price.DE.']].corr().iloc[0, 1]
    axes[idx].text(0.05, 0.95, f'Correlation: {corr:.3f}',
                   transform=axes[idx].transAxes,
                   fontsize=10,
                   verticalalignment='top',
                   bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.show()

# Analyze relationship patterns using original data
print("\nüîç Relationship Pattern Analysis:")
for feature in features_to_plot:
    corr = df[[feature, 'Price.DE.']].corr().iloc[0, 1]

    if abs(corr) > 0.7:
        strength = "STRONG"
    elif abs(corr) > 0.4:
        strength = "MODERATE"
    else:
        strength = "WEAK"

    direction = "POSITIVE" if corr > 0 else "NEGATIVE"

    print(f"\n{feature}:")
    print(f"   Correlation: {corr:.3f}")
    print(f"   Strength: {strength}")
    print(f"   Direction: {direction}")

    if direction == "POSITIVE":
        print(f"   Meaning: As {feature} increases, Price tends to increase")
    else:
        print(f"   Meaning: As {feature} increases, Price tends to decrease")

In [None]:
# ============================================================================
# 5. TOP AND BOTTOM EVs ANALYSIS
# ============================================================================

print("\n" + "="*80)
print("STEP 3.5: TOP & BOTTOM EVs ANALYSIS")
print("="*80)

# Top 10 most expensive EVs
print("\nüí∞ TOP 10 MOST EXPENSIVE EVs:")
print("="*80)
top_10 = df.nlargest(10, 'Price.DE.')[['Car_name', 'Price.DE.', 'Battery',
                                         'Range', 'Top_speed', 'acceleration..0.100.']]
top_10_display = top_10.copy()
top_10_display['Price.DE.'] = top_10_display['Price.DE.'].apply(lambda x: f'‚Ç¨{x:,.0f}')
print(top_10_display.to_string(index=False))

# Visualize top 10 expensive
plt.figure(figsize=(14, 6))
top_10_plot = df.nlargest(10, 'Price.DE.')
plt.barh(range(len(top_10_plot)), top_10_plot['Price.DE.'], color='gold', edgecolor='black')
plt.yticks(range(len(top_10_plot)), top_10_plot['Car_name'], fontsize=10)
plt.xlabel('Price (EUR)', fontsize=12, fontweight='bold')
plt.title('Top 10 Most Expensive EVs', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()  # Highest at top
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

# Top 10 least expensive EVs
print("\nüíµ TOP 10 LEAST EXPENSIVE EVs:")
print("="*80)
bottom_10 = df.nsmallest(10, 'Price.DE.')[['Car_name', 'Price.DE.', 'Battery',
                                            'Range', 'Top_speed', 'acceleration..0.100.']]
bottom_10_display = bottom_10.copy()
bottom_10_display['Price.DE.'] = bottom_10_display['Price.DE.'].apply(lambda x: f'‚Ç¨{x:,.0f}')
print(bottom_10_display.to_string(index=False))

# Visualize bottom 10
plt.figure(figsize=(14, 6))
bottom_10_plot = df.nsmallest(10, 'Price.DE.')
plt.barh(range(len(bottom_10_plot)), bottom_10_plot['Price.DE.'], color='lightblue', edgecolor='black')
plt.yticks(range(len(bottom_10_plot)), bottom_10_plot['Car_name'], fontsize=10)
plt.xlabel('Price (EUR)', fontsize=12, fontweight='bold')
plt.title('Top 10 Least Expensive EVs', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

# Compare characteristics
print("\nüìä COMPARISON: Expensive vs Cheap EVs")
print("="*80)
expensive_avg = df.nlargest(10, 'Price.DE.')[['Battery', 'Range', 'Top_speed', 'acceleration..0.100.']].mean()
cheap_avg = df.nsmallest(10, 'Price.DE.')[['Battery', 'Range', 'Top_speed', 'acceleration..0.100.']].mean()

comparison = pd.DataFrame({
    'Feature': ['Battery (kWh)', 'Range (km)', 'Top Speed (km/h)', 'Acceleration (0-100)'],
    'Expensive EVs (Avg)': [expensive_avg['Battery'], expensive_avg['Range'],
                             expensive_avg['Top_speed'], expensive_avg['acceleration..0.100.']],
    'Cheap EVs (Avg)': [cheap_avg['Battery'], cheap_avg['Range'],
                        cheap_avg['Top_speed'], cheap_avg['acceleration..0.100.']]
})

comparison['Difference'] = comparison['Expensive EVs (Avg)'] - comparison['Cheap EVs (Avg)']
print(comparison.to_string(index=False))

In [None]:
# ============================================================================
# 6. CATEGORY-BASED ANALYSIS
# ============================================================================

print("\n" + "="*80)
print("STEP 3.6: CATEGORY-BASED ANALYSIS")
print("="*80)

# Create battery capacity categories
df['Battery_Category'] = pd.cut(df['Battery'],
                                 bins=[0, 50, 70, 90, 200],
                                 labels=['Small (<50)', 'Medium (50-70)',
                                        'Large (70-90)', 'XL (90+)'])

# Create range categories
df['Range_Category'] = pd.cut(df['Range'],
                               bins=[0, 300, 400, 500, 1000],
                               labels=['Short (<300)', 'Medium (300-400)',
                                      'Long (400-500)', 'XL (500+)'])

# Average price by battery category
print("\nüí° Average Price by Battery Capacity:")
battery_price = df.groupby('Battery_Category')['Price.DE.'].agg(['mean', 'count', 'min', 'max'])
battery_price['mean'] = battery_price['mean'].apply(lambda x: f'‚Ç¨{x:,.0f}')
battery_price['min'] = battery_price['min'].apply(lambda x: f'‚Ç¨{x:,.0f}')
battery_price['max'] = battery_price['max'].apply(lambda x: f'‚Ç¨{x:,.0f}')
print(battery_price)

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Battery category vs price
battery_price_plot = df.groupby('Battery_Category')['Price.DE.'].mean().sort_values()
axes[0].bar(range(len(battery_price_plot)), battery_price_plot.values,
            color=['lightblue', 'skyblue', 'dodgerblue', 'darkblue'],
            edgecolor='black', linewidth=1.5)
axes[0].set_xticks(range(len(battery_price_plot)))
axes[0].set_xticklabels(battery_price_plot.index, rotation=45)
axes[0].set_ylabel('Average Price (EUR)', fontsize=12, fontweight='bold')
axes[0].set_title('Average Price by Battery Capacity', fontsize=13, fontweight='bold')
axes[0].grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for i, v in enumerate(battery_price_plot.values):
    axes[0].text(i, v + 1000, f'‚Ç¨{v:,.0f}', ha='center', fontweight='bold')

# Range category vs price
range_price_plot = df.groupby('Range_Category')['Price.DE.'].mean().sort_values()
axes[1].bar(range(len(range_price_plot)), range_price_plot.values,
            color=['lightcoral', 'coral', 'orangered', 'darkred'],
            edgecolor='black', linewidth=1.5)
axes[1].set_xticks(range(len(range_price_plot)))
axes[1].set_xticklabels(range_price_plot.index, rotation=45)
axes[1].set_ylabel('Average Price (EUR)', fontsize=12, fontweight='bold')
axes[1].set_title('Average Price by Range Category', fontsize=13, fontweight='bold')
axes[1].grid(True, alpha=0.3, axis='y')

# Add value labels
for i, v in enumerate(range_price_plot.values):
    axes[1].text(i, v + 1000, f'‚Ç¨{v:,.0f}', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

# Box plots by category
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Battery category box plot
df.boxplot(column='Price.DE.', by='Battery_Category', ax=axes[0],
           patch_artist=True, grid=False)
axes[0].set_xlabel('Battery Category', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Price (EUR)', fontsize=12, fontweight='bold')
axes[0].set_title('Price Distribution by Battery Capacity', fontsize=13, fontweight='bold')
plt.sca(axes[0])
plt.xticks(rotation=45)

# Range category box plot
df.boxplot(column='Price.DE.', by='Range_Category', ax=axes[1],
           patch_artist=True, grid=False)
axes[1].set_xlabel('Range Category', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Price (EUR)', fontsize=12, fontweight='bold')
axes[1].set_title('Price Distribution by Range Category', fontsize=13, fontweight='bold')
plt.sca(axes[1])
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# 7. BRAND ANALYSIS
# ============================================================================

print("\n" + "="*80)
print("STEP 3.7: BRAND ANALYSIS")
print("="*80)

# Extract brand from car name
df['Brand'] = df['Car_name'].str.split().str[0]

# Count by brand
print("\nüìä Number of Models by Brand:")
brand_counts = df['Brand'].value_counts().head(15)
print(brand_counts)

# Average price by brand
print("\nüí∞ Average Price by Brand (Top 15):")
brand_price = df.groupby('Brand')['Price.DE.'].mean().sort_values(ascending=False).head(15)
for brand, price in brand_price.items():
    print(f"   {brand:20s}: ‚Ç¨{price:,.0f}")

# Visualize
fig, axes = plt.subplots(2, 1, figsize=(14, 12))

# Top 15 brands by count
axes[0].barh(range(len(brand_counts)), brand_counts.values, color='lightgreen', edgecolor='black')
axes[0].set_yticks(range(len(brand_counts)))
axes[0].set_yticklabels(brand_counts.index)
axes[0].set_xlabel('Number of Models', fontsize=12, fontweight='bold')
axes[0].set_title('Top 15 Brands by Number of Models', fontsize=13, fontweight='bold')
axes[0].invert_yaxis()
axes[0].grid(True, alpha=0.3, axis='x')

# Top 15 brands by average price
axes[1].barh(range(len(brand_price)), brand_price.values, color='gold', edgecolor='black')
axes[1].set_yticks(range(len(brand_price)))
axes[1].set_yticklabels(brand_price.index)
axes[1].set_xlabel('Average Price (EUR)', fontsize=12, fontweight='bold')
axes[1].set_title('Top 15 Brands by Average Price', fontsize=13, fontweight='bold')
axes[1].invert_yaxis()
axes[1].grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

# Brand price range
print("\nüìà Price Range by Brand (Top 10):")
top_brands = df['Brand'].value_counts().head(10).index
brand_stats = df[df['Brand'].isin(top_brands)].groupby('Brand')['Price.DE.'].agg(['min', 'max', 'mean', 'count'])
brand_stats = brand_stats.sort_values('mean', ascending=False)
brand_stats['min'] = brand_stats['min'].apply(lambda x: f'‚Ç¨{x:,.0f}')
brand_stats['max'] = brand_stats['max'].apply(lambda x: f'‚Ç¨{x:,.0f}')
brand_stats['mean'] = brand_stats['mean'].apply(lambda x: f'‚Ç¨{x:,.0f}')
print(brand_stats)

In [None]:
# ============================================================================
# 8. EDA SUMMARY & KEY INSIGHTS
# ============================================================================

print("\n" + "="*80)
print("KEY INSIGHTS FROM EDA")
print("="*80)

# Calculate correlations
price_corr = df[['Battery', 'Range', 'Top_speed', 'Efficiency',
                 'acceleration..0.100.', 'Price.DE.']].corr()['Price.DE.'].sort_values(ascending=False)

print("\nüéØ MOST IMPORTANT FEATURES FOR PREDICTING PRICE:")
for idx, (feature, corr) in enumerate(price_corr.drop('Price.DE.').head(3).items(), 1):
    print(f"   {idx}. {feature:20s} (correlation: {corr:.3f})")

print("\nüìä DATA QUALITY:")
print(f"   Total EVs in dataset: {len(df)}")
print(f"   Missing values: {df.isnull().sum().sum()}")
print(f"   Number of brands: {df['Brand'].nunique()}")

print("\nüí° PRICE INSIGHTS:")
print(f"   Price range: ‚Ç¨{df['Price.DE.'].min():,.0f} - ‚Ç¨{df['Price.DE.'].max():,.0f}")
print(f"   Most common price range: ‚Ç¨{df['Price.DE.'].quantile(0.25):,.0f} - ‚Ç¨{df['Price.DE.'].quantile(0.75):,.0f}")
print(f"   Average EV price: ‚Ç¨{df['Price.DE.'].mean():,.0f}")

print("\n‚úÖ READY FOR MODEL BUILDING!")
print("="*80)

In [49]:
# ============================================================================
# COMPLETE GITHUB PREPARATION SCRIPT
# Run this entire cell in Google Colab
# ============================================================================

from google.colab import drive, files
import pickle
import os
import shutil
import zipfile
import pandas as pd # Import pandas

print("=" * 80)
print("üöÄ PREPARING PROJECT FOR GITHUB")
print("=" * 80)

# Step 1: Mount Google Drive
print("\nüìÇ Step 1: Mounting Google Drive...")
drive.mount('/content/drive')
print("‚úÖ Google Drive mounted!")

# Step 2: Create project folder structure
print("\nüìÅ Step 2: Creating project structure...")
project_folder = '/content/ev_price_prediction'
os.makedirs(project_folder, exist_ok=True)

# Create subfolders
folders = ['data', 'models', 'notebooks', 'src']
for folder in folders:
    os.makedirs(f'{project_folder}/{folder}', exist_ok=True)
    print(f"   ‚úì Created: {folder}/")

print("‚úÖ Folder structure created!")

# ============================================================================
# Step 3: Create README.md
# ============================================================================

print("\nüìù Step 3: Creating README.md...")

readme_content = """# üöó Electric Vehicle Price Prediction

A machine learning project to predict electric vehicle prices based on their specifications using Python and Scikit-learn.

![Python](https://img.shields.io/badge/Python-3.8+-blue.svg)
![License](https://img.shields.io/badge/License-MIT-green.svg)
![Status](https://img.shields.io/badge/Status-Active-success.svg)

## üìä Project Overview

This project analyzes electric vehicle data and builds machine learning models to predict EV prices in Germany based on various features such as battery capacity, range, top speed, and brand.

## üéØ Key Features

- **Comprehensive EDA**: In-depth exploratory data analysis with visualizations
- **Multiple ML Models**: Comparison of Linear Regression, Decision Tree, Random Forest, and Gradient Boosting
- **High Accuracy**: Achieved 90%+ prediction accuracy
- **Feature Engineering**: Created new features to improve model performance
- **Easy Predictions**: Simple API to predict prices for new EVs

## üìÅ Project Structure

SyntaxError: incomplete input (ipython-input-489021146.py, line 41)

In [55]:
!ls -R /content/ev_price_prediction

/content/ev_price_prediction:
src

/content/ev_price_prediction/src:
predict.py  __pycache__

/content/ev_price_prediction/src/__pycache__:
predict.cpython-312.pyc


In [54]:
import os
import sys

# Define the project directory
project_folder = '/content/ev_price_prediction'

# Create the src directory if it doesn't exist
src_path = os.path.join(project_folder, 'src')
os.makedirs(src_path, exist_ok=True)

# Create a dummy predict.py file with a basic function
predict_file_path = os.path.join(src_path, 'predict.py')
predict_file_content = """
def predict_ev_price(battery, efficiency, fast_charge, range_km, top_speed, acceleration, brand_name):
    # This is a dummy function. Replace with your actual prediction logic.
    print("Dummy prediction function called.")
    print(f"Received inputs: Battery={battery}, Efficiency={efficiency}, Fast_charge={fast_charge}, Range={range_km}, Top_speed={top_speed}, Acceleration={acceleration}, Brand={brand_name}")
    # Return a placeholder value
    return 50000.0

if __name__ == '__main__':
    # Example usage of the dummy function
    dummy_price = predict_ev_price(80, 175, 30, 500, 210, 4.5, 'Tesla')
    print(f"Dummy predicted price: ‚Ç¨{dummy_price:,.2f}")
"""

with open(predict_file_path, 'w') as f:
    f.write(predict_file_content)

print(f"Created dummy predict.py at: {predict_file_path}")

# Add the project directory to the Python path so the 'src' module can be found
if project_folder not in sys.path:
    sys.path.append(project_folder)
    print(f"Added {project_folder} to sys.path")

# Verify that the src module can now be imported
try:
    from src.predict import predict_ev_price
    print("Successfully imported predict_ev_price from src.")
except ModuleNotFoundError:
    print("Error: src module still not found after creating the directory and file.")

Created dummy predict.py at: /content/ev_price_prediction/src/predict.py
Added /content/ev_price_prediction to sys.path
Successfully imported predict_ev_price from src.


In [50]:
    git clone https://github.com/YOUR_USERNAME/ev-price-prediction.git
    cd ev-price-prediction

SyntaxError: invalid syntax (ipython-input-2873218289.py, line 1)

In [51]:
    pip install -r requirements.txt

[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
[0m

In [52]:
from src.predict import predict_ev_price

# Example prediction
price = predict_ev_price(
    battery=80,           # Battery capacity in kWh
    efficiency=175,       # Efficiency in Wh/km
    fast_charge=30,       # Fast charge time in minutes
    range_km=500,         # Range in kilometers
    top_speed=210,        # Top speed in km/h
    acceleration=4.5,     # 0-100 km/h time in seconds
    brand_name='Tesla'    # Brand name
)

print(f"Predicted Price: ‚Ç¨{price:,.2f}")

ModuleNotFoundError: No module named 'src'

In [53]:
!git clone https://github.com/YOUR_USERNAME/ev-price-prediction.git
#!cd ev-price-prediction # Removed cd command

Cloning into 'ev-price-prediction'...
fatal: could not read Username for 'https://github.com': No such device or address


In [46]:
pip install -r requirements.txt

[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
[0m

In [47]:
from src.predict import predict_ev_price

# Predict price for a Tesla-like EV
price = predict_ev_price(
    battery=80,           # Battery capacity in kWh
    efficiency=175,       # Efficiency in Wh/km
    fast_charge=30,       # Fast charge time in minutes
    range_km=500,         # Range in kilometers
    top_speed=210,        # Top speed in km/h
    acceleration=4.5,     # 0-100 km/h time in seconds
    brand_name='Tesla'    # Brand name
)

print(f"Predicted Price: ‚Ç¨{price:,.2f}")

ModuleNotFoundError: No module named 'src'

# Task
Load the dataset "fatihilhan/electric-vehicle-specifications-and-prices" from Kaggle and display the first 5 rows.

## Data preprocessing

### Subtask:
Prepare the data for machine learning models. This includes handling missing values, encoding categorical features, and scaling numerical features.


**Reasoning**:
Identify numerical and categorical features, handle missing values in both, encode categorical features using one-hot encoding, scale numerical features using StandardScaler, combine the features into a single DataFrame, and create the target variable Series.



In [56]:
# Identify numerical and categorical columns
# Exclude 'Price.DE.' as it's the target variable
# Exclude 'Car_name', 'Car_name_link' as they are likely not useful features and 'Brand' is already created
numerical_features = ['Battery', 'Efficiency', 'Fast_charge', 'Range', 'Top_speed', 'acceleration..0.100.']
categorical_features = ['Brand']

# Handle missing values in numerical features by imputing with the median
for feature in numerical_features:
    if df[feature].isnull().any():
        median_value = df[feature].median()
        df[feature].fillna(median_value, inplace=True)
        print(f"Imputed missing values in '{feature}' with median: {median_value:.2f}")

# Handle missing values in categorical features by imputing with the mode
for feature in categorical_features:
    if df[feature].isnull().any():
        mode_value = df[feature].mode()[0] # mode() returns a Series
        df[feature].fillna(mode_value, inplace=True)
        print(f"Imputed missing values in '{feature}' with mode: {mode_value}")


# Separate features (X) and target (y)
X = df[numerical_features + categorical_features].copy()
y = df['Price.DE.'].copy()

# Apply one-hot encoding to the categorical features
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)


# Scale the numerical features
scaler = StandardScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Display the first few rows of the processed feature DataFrame
print("\nProcessed Features (X):")
display(X.head())

# Display the first few rows of the target Series
print("\nTarget Variable (y):")
display(y.head())


Processed Features (X):


Unnamed: 0,Battery,Efficiency,Fast_charge,Range,Top_speed,acceleration..0.100.,Brand_Aiways,Brand_Audi,Brand_BMW,Brand_BYD,...,Brand_Smart,Brand_SsangYong,Brand_Subaru,Brand_Tesla,Brand_Toyota,Brand_VinFast,Brand_Volkswagen,Brand_Volvo,Brand_Zeekr,Brand_e.Go
0,0.190108,-0.718326,0.490788,0.604933,0.982968,-0.744875,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,-0.658714,-1.792093,0.616225,0.467185,0.544517,-0.389929,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
2,-0.513201,-0.380857,-0.763583,-0.359304,-0.579015,-0.002715,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,-0.454996,-0.749006,0.323538,-0.083808,-0.579015,0.190892,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0.190108,-1.423945,0.950724,1.247758,0.544517,-0.938481,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False



Target Variable (y):


Unnamed: 0,Price.DE.
0,59017.0
1,46220.0
2,44625.0
3,39990.0
4,55220.0


## Feature selection

### Subtask:
Select the features that will be used to train the models based on the EDA and correlation analysis.


**Reasoning**:
Select the features based on the EDA and correlation analysis and create the selected features dataframe.



In [57]:
# Based on EDA and correlation analysis (Step 3.3 and 3.4),
# the features with moderate to strong correlation with Price.DE. are:
# 'Top_speed' (0.763), 'Battery' (0.704), 'Fast_charge' (0.619), 'Range' (0.594)
# 'acceleration..0.100.' (-0.538) also shows a moderate negative correlation.
# 'Efficiency' (0.179) shows a weak positive correlation.
# The 'Brand' categorical feature also showed significant price variation across brands (Step 3.7).

# Let's select features with an absolute correlation greater than a threshold (e.g., 0.4)
# plus the one-hot encoded 'Brand' features.
correlation_threshold = 0.4

# Get numerical features with correlation above the threshold
numerical_features_correlated = price_correlation[abs(price_correlation) > correlation_threshold].index.tolist()
# Remove 'Price.DE.' from this list as it's the target
if 'Price.DE.' in numerical_features_correlated:
    numerical_features_correlated.remove('Price.DE.')

# Get the one-hot encoded brand columns from the processed X DataFrame
brand_columns = [col for col in X.columns if col.startswith('Brand_')]

# Combine the selected numerical features and brand columns
selected_features_list = numerical_features_correlated + brand_columns

# Create the new DataFrame X_selected with only the selected features
X_selected = X[selected_features_list].copy()

# Display the first few rows of the X_selected DataFrame
print("Selected Features (X_selected):")
display(X_selected.head())

print(f"\nNumber of selected features: {len(selected_features_list)}")
print("Selected features list created.")

Selected Features (X_selected):


Unnamed: 0,Top_speed,Battery,Fast_charge,Range,acceleration..0.100.,Brand_Aiways,Brand_Audi,Brand_BMW,Brand_BYD,Brand_CUPRA,...,Brand_Smart,Brand_SsangYong,Brand_Subaru,Brand_Tesla,Brand_Toyota,Brand_VinFast,Brand_Volkswagen,Brand_Volvo,Brand_Zeekr,Brand_e.Go
0,0.982968,0.190108,0.490788,0.604933,-0.744875,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,0.544517,-0.658714,0.616225,0.467185,-0.389929,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
2,-0.579015,-0.513201,-0.763583,-0.359304,-0.002715,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,-0.579015,-0.454996,0.323538,-0.083808,0.190892,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0.544517,0.190108,0.950724,1.247758,-0.938481,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False



Number of selected features: 50
Selected features list created.


## Split data

### Subtask:
Split the dataset into training and testing sets.


**Reasoning**:
Split the data into training and testing sets and print their shapes.



In [58]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Print the shapes of the resulting sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (247, 50)
Shape of X_test: (62, 50)
Shape of y_train: (247,)
Shape of y_test: (62,)


## Train models

### Subtask:
Train several regression models on the training data.


**Reasoning**:
Import the necessary model classes and create a dictionary of instantiated models, then train each model on the training data.



In [59]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Create a dictionary of regression models
trained_models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(random_state=42),
    'Lasso Regression': Lasso(random_state=42),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Train each model
print("Training regression models...")
for name, model in trained_models.items():
    model.fit(X_train, y_train)
    print(f"‚úÖ {name} trained successfully.")

print("\nAll models trained.")

Training regression models...
‚úÖ Linear Regression trained successfully.
‚úÖ Ridge Regression trained successfully.
‚úÖ Lasso Regression trained successfully.
‚úÖ Decision Tree trained successfully.
‚úÖ Random Forest trained successfully.
‚úÖ Gradient Boosting trained successfully.

All models trained.


## Evaluate models

### Subtask:
Evaluate the performance of each trained model using appropriate metrics.


**Reasoning**:
Evaluate each trained model using MSE, RMSE, MAE, and R2 scores on the test set and store the results in a DataFrame for comparison.



In [60]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
import numpy as np

# Initialize a dictionary to store evaluation results
evaluation_results = {}

# Iterate through the trained models and evaluate each one
print("Evaluating regression models...")
for name, model in trained_models.items():
    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse) # Calculate RMSE from MSE
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store the metrics
    evaluation_results[name] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }
    print(f"‚úÖ {name} evaluation complete.")

# Convert the results to a DataFrame
evaluation_df = pd.DataFrame(evaluation_results).T

# Sort the DataFrame by R2 score for easy comparison
evaluation_df = evaluation_df.sort_values(by='R2', ascending=False)

# Display the evaluation results
print("\nModel Evaluation Results:")
display(evaluation_df)

print("\nEvaluation complete. Results are displayed above.")

Evaluating regression models...
‚úÖ Linear Regression evaluation complete.
‚úÖ Ridge Regression evaluation complete.
‚úÖ Lasso Regression evaluation complete.
‚úÖ Decision Tree evaluation complete.
‚úÖ Random Forest evaluation complete.
‚úÖ Gradient Boosting evaluation complete.

Model Evaluation Results:


Unnamed: 0,MSE,RMSE,MAE,R2
Gradient Boosting,86165060.0,9282.513779,6012.138275,0.941884
Random Forest,118312700.0,10877.165681,7625.919708,0.920202
Linear Regression,226592500.0,15052.988501,10732.410389,0.84717
Lasso Regression,226658300.0,15055.175302,10731.643342,0.847126
Ridge Regression,258325800.0,16072.517134,11196.971966,0.825767
Decision Tree,275951300.0,16611.783094,10715.806452,0.813879



Evaluation complete. Results are displayed above.


## Select best model

### Subtask:
Choose the best performing model based on the evaluation metrics.


**Reasoning**:
Identify and store the best performing model based on the R2 score from the evaluation results.



In [61]:
# Identify the model with the highest R2 score
best_model_name = evaluation_df['R2'].idxmax()
best_r2_score = evaluation_df['R2'].max()

# Get the actual best performing model object
best_model = trained_models[best_model_name]

# Print the name and R2 score of the best model
print(f"üèÜ Best Performing Model: {best_model_name}")
print(f"   R2 Score: {best_r2_score:.4f}")

üèÜ Best Performing Model: Gradient Boosting
   R2 Score: 0.9419
