In [11]:
import pandas as pd
df = pd.read_csv('car data.csv')

# Display the first few rows of the dataframe
print("First few rows of the dataset:")
print(df.head())

# Check for missing values
print("\nMissing values in the dataset:")
print(df.isnull().sum())

# Display basic statistics and structure
print("\nInformation about the dataset:")
print(df.info())

print("\nBasic statistics of the dataset:")
print(df.describe())

First few rows of the dataset:
  Car_Name  Year  Selling_Price  Present_Price  Driven_kms Fuel_Type  \
0     ritz  2014           3.35           5.59       27000    Petrol   
1      sx4  2013           4.75           9.54       43000    Diesel   
2     ciaz  2017           7.25           9.85        6900    Petrol   
3  wagon r  2011           2.85           4.15        5200    Petrol   
4    swift  2014           4.60           6.87       42450    Diesel   

  Selling_type Transmission  Owner  
0       Dealer       Manual      0  
1       Dealer       Manual      0  
2       Dealer       Manual      0  
3       Dealer       Manual      0  
4       Dealer       Manual      0  

Missing values in the dataset:
Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Driven_kms       0
Fuel_Type        0
Selling_type     0
Transmission     0
Owner            0
dtype: int64

Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 t

In [12]:
# Handle missing values (example: drop rows with missing values)
df.dropna(inplace=True)

# Encode categorical variables
categorical_columns = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Display the first few rows to verify encoding
print("\nFirst few rows after encoding categorical variables:")
print(df.head())



First few rows after encoding categorical variables:
   Year  Selling_Price  Present_Price  Driven_kms  Owner  Car_Name_Activa 3g  \
0  2014           3.35           5.59       27000      0               False   
1  2013           4.75           9.54       43000      0               False   
2  2017           7.25           9.85        6900      0               False   
3  2011           2.85           4.15        5200      0               False   
4  2014           4.60           6.87       42450      0               False   

   Car_Name_Activa 4g  Car_Name_Bajaj  ct 100  Car_Name_Bajaj Avenger 150  \
0               False                   False                       False   
1               False                   False                       False   
2               False                   False                       False   
3               False                   False                       False   
4               False                   False                       False   

  

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into features and target
X = df.drop('Selling_Price', axis=1)  # Replace 'price' with the actual target column name
y = df['Selling_Price']  # Replace 'price' with the actual target column name

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 3.935178493502141e+23
R^2 Score: -1.7083041630812318e+22


In [26]:
def predict_car_price(features):
    features_df = pd.DataFrame([features])
    features_df = pd.get_dummies(features_df, columns=categorical_columns, drop_first=True)
    
    # Ensure the features dataframe has the same columns as the training set
    missing_cols = set(X.columns) - set(features_df.columns)
    for col in missing_cols:
        features_df[col] = 0
    features_df = features_df[X.columns]
    
    # Scale the features
    features_scaled = scaler.transform(features_df)
    prediction = model.predict(features_scaled)
    
    return prediction[0]

# Test the function
test_features = {
    'Car_Name': 'ciaz',
    'Present_Price': 9.29,
    'Driven_kms': 37000,
    'Fuel_Type': 'Petrol',
    'Selling_type': 'Dealer',
    'Transmission': 'Automatic',
    'Owner': 0
}

print(f'Predicted Car Price: {predict_car_price(test_features)}')


Predicted Car Price: -1157.666163901742


  features_df[col] = 0
  features_df[col] = 0
  features_df[col] = 0
  features_df[col] = 0
