<a href="https://colab.research.google.com/github/UznetDev/Data-science-home-work/blob/main/01_Avg%2C_2024_home_work.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [139]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
import warnings
import re
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [140]:
df = pd.read_csv('car_prices.csv')
df.shape

(54273, 13)

In [141]:
df.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


In [142]:
for col in df.columns:
    print(f'{col}: {df[col].nunique()}')

id: 54273
brand: 53
model: 1827
model_year: 34
milage: 3212
fuel_type: 7
engine: 1061
transmission: 46
ext_col: 260
int_col: 124
accident: 2
clean_title: 1
price: 1481


In [143]:
def parse_engine_info(engine):
    if pd.isna(engine):
        return pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan])

    hp = re.search(r'(\d+\.?\d*)HP', engine)
    liter = re.search(r'(\d+\.?\d*)L', engine)
    motor = re.search(r'(Straight|V)\s*\d*', engine)
    cylinder = re.search(r'(\d+)\s*Cylinder', engine)

    return pd.Series([
        hp.group(1) if hp else np.nan,
        liter.group(1) if liter else np.nan,
        motor.group(0) if motor else np.nan,
        cylinder.group(1) if cylinder else np.nan,
    ])

df[['hp', 'litr', 'motor', 'Cylinder']] = df['engine'].apply(parse_engine_info)

In [144]:
df = df.drop(columns=['clean_title', 'id'])

In [145]:
df.head(1)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price,hp,litr,motor,Cylinder
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000,375.0,3.5,V6,6


In [146]:
df.isna().sum()

Unnamed: 0,0
brand,0
model,0
model_year,0
milage,0
fuel_type,0
engine,0
transmission,0
ext_col,0
int_col,0
accident,0


In [147]:
def fill_nan_with_groupby(df, target_column, groupby_columns, agg_func='mean'):
    fill_values = df.groupby(groupby_columns)[target_column].transform(agg_func)
    df[target_column].fillna(fill_values, inplace=True)
    return df

df['hp'] = pd.to_numeric(df['hp'], errors='coerce')
filled_df = fill_nan_with_groupby(df, 'hp', ['motor', 'Cylinder'])

# nan_groups = filled_df[filled_df['hp'].isna()].groupby(['motor', 'Cylinder']).size()


# print(nan_groups)

filled_df['hp'].fillna(filled_df['hp'].mean(), inplace=True)

In [148]:
df.isna().sum()

Unnamed: 0,0
brand,0
model,0
model_year,0
milage,0
fuel_type,0
engine,0
transmission,0
ext_col,0
int_col,0
accident,0


In [149]:
def fill_nan_with_groupby(df, target_column, groupby_columns, agg_func='mean'):
    fill_values = df.groupby(groupby_columns)[target_column].transform(agg_func)
    df[target_column].fillna(fill_values, inplace=True)
    return df

df['hp'] = pd.to_numeric(df['hp'], errors='coerce')
fill_nan_with_groupby(df, 'hp', ['motor', 'Cylinder'])

# df['hp'].fillna(filled_df['hp'].mean(), inplace=True)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price,hp,litr,motor,Cylinder
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000,375.000000,3.5,V6,6
1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,8250,300.000000,3.0,Straight 6,6
2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,15000,300.000000,4.2,,8
3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,63500,335.000000,3.0,Straight 6,6
4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,7850,200.000000,3.8,V6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54268,BMW,X6 xDrive50i,2017,29000,Gasoline,445.0HP 4.4L 8 Cylinder Engine Gasoline Fuel,8-Speed A/T,White,Brown,None reported,29000,445.000000,4.4,,8
54269,Audi,A4 2.0T Premium,2015,94634,E85 Flex Fuel,220.0HP 2.0L 4 Cylinder Engine Flex Fuel Capab...,6-Speed A/T,Black,Black,At least 1 accident or damage reported,6500,220.000000,2.0,,4
54270,Porsche,Cayenne S,2013,40989,Gasoline,420.0HP 3.6L V6 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,White,Black,At least 1 accident or damage reported,18950,420.000000,3.6,V6,6
54271,Porsche,911 Carrera 4 GTS,2023,1518,Gasoline,4.0L H6 24V GDI DOHC,8-Speed Automatic with Auto-Shift,Beige,Brown,None reported,194965,331.697277,4.0,V,


In [150]:
df.isna().sum()

Unnamed: 0,0
brand,0
model,0
model_year,0
milage,0
fuel_type,0
engine,0
transmission,0
ext_col,0
int_col,0
accident,0


In [151]:
# mean_prices = df[df['Price'].notna()].groupby(by=['Route', 'Date_of_Journey', 'Additional_Info'])['Price'].mean().reset_index()
# df = pd.merge(df, mean_prices, on=['Route', 'Date_of_Journey', 'Additional_Info'], how='left', suffixes=('', '_mean'))
# df['Price'] = df['Price'].fillna(df['Price_mean'])
# df.drop(columns=['Price_mean'], inplace=True)

In [152]:
# df['hp'] = df['engine'].agg(lambda x: x.split('HP')[0] if "HP" in x else np.nan)
# df['hp'] = pd.to_numeric(df['hp'])

In [153]:
# df['litr'] = df['engine'].agg(lambda x: x.split('L').split(' ')[0] if "L" in x else np.nan)
# df['litr'] = pd.to_numeric(df['litr'])
# df.head()