In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
import warnings
pio.renderers.default = "svg"

In [2]:
raw_df = pd.read_csv("Sport car price.csv")
raw_df

Unnamed: 0,Car Make,Car Model,Year,Engine Size (L),Horsepower,Torque (lb-ft),0-60 MPH Time (seconds),Price (in USD)
0,Porsche,911,2022,3,379,331,4,101200
1,Lamborghini,Huracan,2021,5.2,630,443,2.8,274390
2,Ferrari,488 GTB,2022,3.9,661,561,3,333750
3,Audi,R8,2022,5.2,562,406,3.2,142700
4,McLaren,720S,2021,4,710,568,2.7,298000
...,...,...,...,...,...,...,...,...
1002,Koenigsegg,Jesko,2022,5,1280,1106,2.5,3000000
1003,Lotus,Evija,2021,Electric Motor,1972,1254,2,2000000
1004,McLaren,Senna,2021,4,789,590,2.7,1000000
1005,Pagani,Huayra,2021,6,764,738,3,2600000


In [3]:
raw_df.shape

(1007, 8)

In [4]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007 entries, 0 to 1006
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Car Make                 1007 non-null   object
 1   Car Model                1007 non-null   object
 2   Year                     1007 non-null   int64 
 3   Engine Size (L)          997 non-null    object
 4   Horsepower               1007 non-null   object
 5   Torque (lb-ft)           1004 non-null   object
 6   0-60 MPH Time (seconds)  1007 non-null   object
 7   Price (in USD)           1007 non-null   object
dtypes: int64(1), object(7)
memory usage: 63.1+ KB


In [5]:
raw_df.describe()

Unnamed: 0,Year
count,1007.0
mean,2021.201589
std,2.019802
min,1965.0
25%,2021.0
50%,2021.0
75%,2022.0
max,2023.0


Clean the dataset

In [6]:
#1) check null values 
raw_df.isnull().sum()

Car Make                    0
Car Model                   0
Year                        0
Engine Size (L)            10
Horsepower                  0
Torque (lb-ft)              3
0-60 MPH Time (seconds)     0
Price (in USD)              0
dtype: int64

In [7]:
raw_df[raw_df['Engine Size (L)'].isnull()]

Unnamed: 0,Car Make,Car Model,Year,Engine Size (L),Horsepower,Torque (lb-ft),0-60 MPH Time (seconds),Price (in USD)
168,Rimac,C_Two,2022,,1914,1696.0,1.9,2400000
171,Tesla,Model S Plaid,2021,,1020,1050.0,1.98,131190
222,Porsche,Taycan Turbo S,2021,,750,774.0,2.6,185000
247,Tesla,Model S Plaid,2022,,1020,1050.0,1.9,131190
387,Rimac,C_Two,2022,,1888,1696.0,1.8,2400000
389,Tesla,Roadster,2022,,10000+,0.0,1.9,200000
686,Rimac,C_Two,2022,,1914,1696.0,1.85,2400000
697,Lotus,Evija,2022,,1972,1254.0,2.5,2700000
752,Porsche,Taycan,2022,,469,479.0,3.8,79900
916,Tesla,Roadster,2022,,"10,000+",,1.9,200000


In [8]:
# check using car make and model to compare all relavent data's
raw_df[raw_df['Car Make'] == 'Tesla'] 

Unnamed: 0,Car Make,Car Model,Year,Engine Size (L),Horsepower,Torque (lb-ft),0-60 MPH Time (seconds),Price (in USD)
99,Tesla,Roadster,2022,Electric,1000+,737,1.9,200000
171,Tesla,Model S Plaid,2021,,1020,1050,1.98,131190
247,Tesla,Model S Plaid,2022,,1020,1050,1.9,131190
300,Tesla,Model S,2022,Electric,1020,1050,1.98,119000
320,Tesla,Model S Plaid,2021,Electric,1020,1050,1.9,131100
335,Tesla,Roadster,2022,-,1000+,-,1.9,200000
354,Tesla,Roadster,2022,Electric,1000+,"10,000+",1.9,200000
364,Tesla,Roadster,2023,Electric,"1,000+",737,< 1.9,200000
389,Tesla,Roadster,2022,,10000+,0,1.9,200000
401,Tesla,Model S Plaid,2021,Electric (tri-motor),1020,1050,1.9,141190


In [9]:
raw_df[raw_df['Car Model'] == 'Roadster']

Unnamed: 0,Car Make,Car Model,Year,Engine Size (L),Horsepower,Torque (lb-ft),0-60 MPH Time (seconds),Price (in USD)
99,Tesla,Roadster,2022,Electric,1000+,737,1.9,200000
335,Tesla,Roadster,2022,-,1000+,-,1.9,200000
354,Tesla,Roadster,2022,Electric,1000+,"10,000+",1.9,200000
364,Tesla,Roadster,2023,Electric,"1,000+",737,< 1.9,200000
389,Tesla,Roadster,2022,,10000+,0,1.9,200000
821,Tesla,Roadster,2022,Electric,1000,737,1.9,200000
885,Tesla,Roadster,2022,0,10000,7376,1.9,200000
916,Tesla,Roadster,2022,,"10,000+",,1.9,200000


In [10]:
# Display rows where 'Engine Size (L)' is null
#print(raw_df[raw_df['Engine Size (L)'].isnull()])

# Display rows where 'Car Make' is 'Rimac'
#print(raw_df[raw_df['Car Make'] == 'Rimac'])

# Fill missing 'Engine Size (L)' values with 'Electric' where 'Car Model' is 'C-Two'
raw_df.loc[(raw_df['Car Model'] == 'C_Two') & (raw_df['Engine Size (L)'].isnull()), 'Engine Size (L)'] = 'Electric'
raw_df.loc[(raw_df['Car Model'] == 'Model S Plaid') & (raw_df['Engine Size (L)'].isnull()), 'Engine Size (L)'] = 'Electric'
raw_df.loc[(raw_df['Car Model'] == 'Taycan Turbo S') & (raw_df['Engine Size (L)'].isnull()), 'Engine Size (L)'] = 'Electric'
raw_df.loc[(raw_df['Car Model'] == 'Roadster') & (raw_df['Engine Size (L)'].isnull()), 'Engine Size (L)'] = 'Electric'
raw_df.loc[(raw_df['Car Model'] == 'Taycan') & (raw_df['Engine Size (L)'].isnull()), 'Engine Size (L)'] = 'Electric'
raw_df.loc[(raw_df['Car Model'] == 'Evija') & (raw_df['Engine Size (L)'].isnull()), 'Engine Size (L)'] = 'Electric'


# Verify if null values are filled
#print(raw_df[raw_df['Car Model'] == 'C_Two'])
#print(raw_df[raw_df['Car Model'] == 'Model S Plaid'])
#print(raw_df[raw_df['Car Model'] == 'Taycan Turbo S'])
#print(raw_df[raw_df['Car Model'] == 'Roadster'])
#print(raw_df[raw_df['Car Model'] == 'Taycan'])
print(raw_df[raw_df['Car Model'] == 'Evija'])


     Car Make Car Model  Year Engine Size (L) Horsepower Torque (lb-ft)  \
420     Lotus     Evija  2022        Electric       1973           1254   
523     Lotus     Evija  2022        Electric       1973           1254   
697     Lotus     Evija  2022        Electric       1972           1254   
877     Lotus     Evija  2021        Electric       2000           1254   
987     Lotus     Evija  2022        Electric       1973           1254   
1003    Lotus     Evija  2021  Electric Motor       1972           1254   

     0-60 MPH Time (seconds) Price (in USD)  
420                      2.5      2,750,000  
523                      2.5      2,600,000  
697                      2.5      2,700,000  
877                      2.8      2,800,000  
987                      2.5      2,000,000  
1003                       2      2,000,000  


In [11]:
#2) null value column =Torque (lb-ft)             3

In [12]:
raw_df[raw_df['Torque (lb-ft)'].isnull()]

Unnamed: 0,Car Make,Car Model,Year,Engine Size (L),Horsepower,Torque (lb-ft),0-60 MPH Time (seconds),Price (in USD)
642,Tesla,Model S Plaid,2021,Electric,1020,,1.9,139990
878,Maserati,GranTurismo,2021,Electric,550,,2.8,200000
916,Tesla,Roadster,2022,Electric,"10,000+",,1.9,200000


In [13]:
# check using car make and model to compare all relavent data's
raw_df[raw_df['Car Make'] == 'Tesla'] 

Unnamed: 0,Car Make,Car Model,Year,Engine Size (L),Horsepower,Torque (lb-ft),0-60 MPH Time (seconds),Price (in USD)
99,Tesla,Roadster,2022,Electric,1000+,737,1.9,200000
171,Tesla,Model S Plaid,2021,Electric,1020,1050,1.98,131190
247,Tesla,Model S Plaid,2022,Electric,1020,1050,1.9,131190
300,Tesla,Model S,2022,Electric,1020,1050,1.98,119000
320,Tesla,Model S Plaid,2021,Electric,1020,1050,1.9,131100
335,Tesla,Roadster,2022,-,1000+,-,1.9,200000
354,Tesla,Roadster,2022,Electric,1000+,"10,000+",1.9,200000
364,Tesla,Roadster,2023,Electric,"1,000+",737,< 1.9,200000
389,Tesla,Roadster,2022,Electric,10000+,0,1.9,200000
401,Tesla,Model S Plaid,2021,Electric (tri-motor),1020,1050,1.9,141190


In [14]:
raw_df[raw_df['Car Model'] == 'Roadster']

Unnamed: 0,Car Make,Car Model,Year,Engine Size (L),Horsepower,Torque (lb-ft),0-60 MPH Time (seconds),Price (in USD)
99,Tesla,Roadster,2022,Electric,1000+,737,1.9,200000
335,Tesla,Roadster,2022,-,1000+,-,1.9,200000
354,Tesla,Roadster,2022,Electric,1000+,"10,000+",1.9,200000
364,Tesla,Roadster,2023,Electric,"1,000+",737,< 1.9,200000
389,Tesla,Roadster,2022,Electric,10000+,0,1.9,200000
821,Tesla,Roadster,2022,Electric,1000,737,1.9,200000
885,Tesla,Roadster,2022,0,10000,7376,1.9,200000
916,Tesla,Roadster,2022,Electric,"10,000+",,1.9,200000


In [15]:

# Display rows where 'Engine Size (L)' is null
#print(raw_df[raw_df['Engine Size (L)'].isnull()])

# Display rows where 'Car Make' is 'Rimac'
#print(raw_df[raw_df['Car Make'] == 'Rimac'])

# Fill missing 'Engine Size (L)' values with 'Electric' where 'Car Model' is 'C-Two'
raw_df.loc[(raw_df['Car Model'] == 'Model S Plaid') & (raw_df['Torque (lb-ft)'].isnull()), 'Torque (lb-ft)'] = '1050'
raw_df.loc[(raw_df['Car Model'] == 'GranTurismo') & (raw_df['Torque (lb-ft)'].isnull()), 'Torque (lb-ft)'] = '996'
raw_df.loc[(raw_df['Car Model'] == 'Roadster') & (raw_df['Torque (lb-ft)'].isnull()), 'Torque (lb-ft)'] = '737'


# Verify if null values are filled
#print(raw_df[raw_df['Car Model'] == 'Model S Plaid'])
#print(raw_df[raw_df['Car Model'] == 'GranTurismo'])
#print(raw_df[raw_df['Car Model'] == 'Roadster'])


In [16]:
raw_df.isna().sum()

Car Make                   0
Car Model                  0
Year                       0
Engine Size (L)            0
Horsepower                 0
Torque (lb-ft)             0
0-60 MPH Time (seconds)    0
Price (in USD)             0
dtype: int64

In [17]:
raw_df['Engine Size (L)'].unique()

array(['3', '5.2', '3.9', '4', '4.4', '6.2', '3.8', '8', '5', '3.5',
       '4.7', '2', '2.9', '6', 'Electric', '6.5', '3.7', 'Electric Motor',
       '2.5', '1.5 + Electric', '6.8', '8.4', '6.6', '7', '1.7', '3.3',
       '-', '6.7', '1.8', 'Electric (tri-motor)', '5.5',
       'Electric (93 kWh)', 'Electric (100 kWh)', 'Hybrid (4.0)', '4.6',
       '3.6', '1.5', 'Hybrid', '5.7', '2.0 (Electric)', '4.0 (Hybrid)',
       '0', '6.4', '6.3', '2.3'], dtype=object)

In [18]:
import numpy as np

# Create a new column 'Vehicle Type' based on 'Engine Size (L)'
raw_df['Vehicle Type'] = np.select(
    [
        raw_df['Engine Size (L)'].isin(['Electric', 'Electric Motor', 'Electric (93 kWh)', 'Electric (100 kWh)', '2.0 (Electric)']),
        raw_df['Engine Size (L)'].isin(['Hybrid', 'Hybrid (4.0)']),
        raw_df['Engine Size (L)'].isin(['1.5 + Electric', 'Electric (tri-motor)'])
    ],
    ['EV', 'Hybrid', 'EV'],
    default='Fuel'
)

# Replace 'Engine Size (L)' values based on certain conditions
raw_df['Engine Size (L)'] = raw_df['Engine Size (L)'].replace({
    'Electric': 0,
    'Hybrid': 1,
    '1.5 + Electric': 1.5,
    'Electric (tri-motor)':0,
    'Electric Motor': 0,
    'Electric (93 kWh)': 0,
    'Electric (100 kWh)': 0,
    'Hybrid (4.0)': 4.0,
    '2.0 (Electric)': 2.0,
    '4.0 (Hybrid)': 4.0
})

# Show the updated DataFrame
print(raw_df.tail())


        Car Make Car Model  Year Engine Size (L) Horsepower Torque (lb-ft)  \
1002  Koenigsegg     Jesko  2022               5       1280           1106   
1003       Lotus     Evija  2021               0       1972           1254   
1004     McLaren     Senna  2021               4        789            590   
1005      Pagani    Huayra  2021               6        764            738   
1006       Rimac    Nevera  2021               0       1888           1696   

     0-60 MPH Time (seconds) Price (in USD) Vehicle Type  
1002                     2.5      3,000,000         Fuel  
1003                       2      2,000,000           EV  
1004                     2.7      1,000,000         Fuel  
1005                       3      2,600,000         Fuel  
1006                    1.85      2,400,000           EV  


In [19]:
raw_df = raw_df[raw_df['Engine Size (L)'] != '-']

In [20]:
"""import numpy as np

# Assign 'EV' for 'Engine Size (L)' equal to 'EV', else assign 'Fuel'
raw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == 'Electric', 'EV', 'Fuel')
raw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == 'Electric Motor', 'EV', 'Fuel')
raw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == '1.5 + Electric', 'EV', 'Fuel')
raw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == 'Electric (tri-motor)', 'EV', 'Fuel')
raw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == 'Electric (93 kWh)', 'EV', 'Fuel')
raw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == 'Electric (100 kWh)', 'EV', 'Fuel')
raw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == 'Hybrid (4.0)', 'Hybrid', 'Fuel')
raw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == 'Hybrid', 'Hybrid', 'Fuel')
raw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == '2.0 (Electric)', 'EV', 'Fuel')
raw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == '4.0 (Hybrid)', 'Hybrid', 'Fuel')




# Assign 'Hybrid' for 'Engine Size (L)' equal to 'Hybrid', else retain the existing value
#new_dataset['Vehicle Type'] = np.where(new_dataset['Engine Size (L)'] == 'Hybrid', 'Hybrid', new_dataset['Vehicle Type'])

raw_df['Engine Size (L)'] = raw_df['Engine Size (L)'].replace('Electric', 0)
raw_df['Engine Size (L)'] = raw_df['Engine Size (L)'].replace('Hybrid', 1)
raw_df['Engine Size (L)'] = raw_df['Engine Size (L)'].replace('1.5 + Electric', 1.5)
raw_df['Engine Size (L)'] = raw_df['Engine Size (L)'].replace('Electric Motor', 0)
raw_df['Engine Size (L)'] = raw_df['Engine Size (L)'].replace('Electric (93 kWh)', 0)
raw_df['Engine Size (L)'] = raw_df['Engine Size (L)'].replace('Electric (100 kWh)', 0)
raw_df['Engine Size (L)'] = raw_df['Engine Size (L)'].replace('Hybrid (4.0)', 4.0)
raw_df['Engine Size (L)'] = raw_df['Engine Size (L)'].replace('2.0 (Electric)', 2.0)
raw_df['Engine Size (L)'] = raw_df['Engine Size (L)'].replace('4.0 (Hybrid)', 4.0)




# Check the result
raw_df['Vehicle Type'].unique()"""


"import numpy as np\n\n# Assign 'EV' for 'Engine Size (L)' equal to 'EV', else assign 'Fuel'\nraw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == 'Electric', 'EV', 'Fuel')\nraw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == 'Electric Motor', 'EV', 'Fuel')\nraw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == '1.5 + Electric', 'EV', 'Fuel')\nraw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == 'Electric (tri-motor)', 'EV', 'Fuel')\nraw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == 'Electric (93 kWh)', 'EV', 'Fuel')\nraw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == 'Electric (100 kWh)', 'EV', 'Fuel')\nraw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == 'Hybrid (4.0)', 'Hybrid', 'Fuel')\nraw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == 'Hybrid', 'Hybrid', 'Fuel')\nraw_df['Vehicle Type'] = np.where(raw_df['Engine Size (L)'] == '2.0 (Electric)', 'EV', 'Fuel')\nraw_df['Vehicle Type'] = np.where(raw

In [21]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1006 entries, 0 to 1006
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Car Make                 1006 non-null   object
 1   Car Model                1006 non-null   object
 2   Year                     1006 non-null   int64 
 3   Engine Size (L)          1006 non-null   object
 4   Horsepower               1006 non-null   object
 5   Torque (lb-ft)           1006 non-null   object
 6   0-60 MPH Time (seconds)  1006 non-null   object
 7   Price (in USD)           1006 non-null   object
 8   Vehicle Type             1006 non-null   object
dtypes: int64(1), object(8)
memory usage: 78.6+ KB


In [22]:
#raw_df.to_csv("new.csv")

In [23]:
# Handle Price column 
# Remove the comma and convert to numeric
raw_df['Price (in USD)'] = raw_df['Price (in USD)'].apply(lambda x : x.replace(',', ''))
raw_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Car Make,Car Model,Year,Engine Size (L),Horsepower,Torque (lb-ft),0-60 MPH Time (seconds),Price (in USD),Vehicle Type
0,Porsche,911,2022,3.0,379,331,4.0,101200,Fuel
1,Lamborghini,Huracan,2021,5.2,630,443,2.8,274390,Fuel
2,Ferrari,488 GTB,2022,3.9,661,561,3.0,333750,Fuel
3,Audi,R8,2022,5.2,562,406,3.2,142700,Fuel
4,McLaren,720S,2021,4.0,710,568,2.7,298000,Fuel


In [24]:
raw_df['Price (in USD)'].min()

'100000'

In [25]:
raw_df['Price (in USD)'].max()

'99990'

In [26]:
import pandas as pd
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder
# Initialize LabelEncoder
le = LabelEncoder()

# Apply Label Encoding to 'Car Model' column
raw_df['Car Model'] = le.fit_transform(raw_df['Car Model'])
raw_df['Car Make'] = le.fit_transform(raw_df['Car Make'])
raw_df['Vehicle Type'] = le.fit_transform(raw_df['Vehicle Type'])


# Print the updated dataset
print(raw_df)


      Car Make  Car Model  Year Engine Size (L) Horsepower Torque (lb-ft)  \
0           28         18  2022               3        379            331   
1           16         98  2021             5.2        630            443   
2           11          5  2022             3.9        661            561   
3            5        130  2022             5.2        562            406   
4           21         15  2021               4        710            568   
...        ...        ...   ...             ...        ...            ...   
1002        15         99  2022               5       1280           1106   
1003        18         76  2021               0       1972           1254   
1004        21        157  2021               4        789            590   
1005        25         95  2021               6        764            738   
1006        29        122  2021               0       1888           1696   

     0-60 MPH Time (seconds) Price (in USD)  Vehicle Type  
0              

In [27]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1006 entries, 0 to 1006
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Car Make                 1006 non-null   int32 
 1   Car Model                1006 non-null   int32 
 2   Year                     1006 non-null   int64 
 3   Engine Size (L)          1006 non-null   object
 4   Horsepower               1006 non-null   object
 5   Torque (lb-ft)           1006 non-null   object
 6   0-60 MPH Time (seconds)  1006 non-null   object
 7   Price (in USD)           1006 non-null   object
 8   Vehicle Type             1006 non-null   int32 
dtypes: int32(3), int64(1), object(5)
memory usage: 66.8+ KB


In [28]:
# replace the typo words <1.9
raw_df['0-60 MPH Time (seconds)'] = raw_df['0-60 MPH Time (seconds)'].str.replace('<', '', regex=False).astype(float)

In [29]:
raw_df['Torque (lb-ft)'].unique()

array(['331', '443', '561', '406', '568', '553', '494', '465', '625',
       '481', '516', '1180', '656', '295', '1015', '398', '317', '384',
       '280', '243', '664', '531', '468', '737', '738', '1696', '309',
       '590', '479', '650', '550', '276', '626', '369', '420', '627',
       '455', '505', '560', '457', '707', '270', '354', '476', '339',
       '1106', '151', '605', '368', '723', '642', '509', '604', '507',
       '513', '600', '440', '1050', '708', '774', '254', '663', '332',
       '530', '470', '258', '290', '413', '1732', '376', '10,000+', '236',
       '0', '472', '1254', '848', '1300', '442', '641', '498', '350',
       '944', '268', '184', '400', '263', '996', '7,376', '1,180', '475',
       '1,050', '740', '538'], dtype=object)

In [30]:
raw_df[raw_df['Torque (lb-ft)'] == '10,000+'] 
raw_df['Torque (lb-ft)'] = pd.to_numeric(raw_df['Torque (lb-ft)'].replace({'10,000+': '885'}), errors='coerce')


In [31]:
raw_df['Torque (lb-ft)'].unique()

array([ 331.,  443.,  561.,  406.,  568.,  553.,  494.,  465.,  625.,
        481.,  516., 1180.,  656.,  295., 1015.,  398.,  317.,  384.,
        280.,  243.,  664.,  531.,  468.,  737.,  738., 1696.,  309.,
        590.,  479.,  650.,  550.,  276.,  626.,  369.,  420.,  627.,
        455.,  505.,  560.,  457.,  707.,  270.,  354.,  476.,  339.,
       1106.,  151.,  605.,  368.,  723.,  642.,  509.,  604.,  507.,
        513.,  600.,  440., 1050.,  708.,  774.,  254.,  663.,  332.,
        530.,  470.,  258.,  290.,  413., 1732.,  376.,  885.,  236.,
          0.,  472., 1254.,  848., 1300.,  442.,  641.,  498.,  350.,
        944.,  268.,  184.,  400.,  263.,  996.,   nan,  475.,  740.,
        538.])

In [43]:
raw_df.to_csv("new.csv")

In [33]:
#housepower
raw_df['Horsepower'].unique()

array(['379', '630', '661', '562', '710', '617', '523', '490', '760',
       '600', '1500', '717', '296', '1280', '471', '416', '454', '300',
       '505', '320', '626', '671', '622', '720', '1914', '414', '759',
       '986', '591', '503', '650', '660', '350', '641', '611', '394',
       '612', '369', '603', '455', '460', '325', '349', '592', '444',
       '405', '797', '770', '332', '473', '480', '573', '380', '1600',
       '181', '620', '764', '624', '1000+', '382', '800', '715', '690',
       '730', '469', '365', '401', '645', '435', '1020', '500', '780',
       '750', '402', '575', '729', '789', '577', '495', '237', '310',
       '791', '1874', '542', '368', '616', '1479', '755', '1,000+', '288',
       '1888', '10000+', '482', '1973', '1262', '1035', '819', '385',
       '647', '1200', '1578', '625', '583', '429', '563', '400', '707',
       '887', '1972', '305', '640', '255', '689', '372', '1000', '2000',
       '550', '10,000', '1,500', '10,000+', '485', '1,020', '1872', '621'

In [34]:
raw_df[raw_df['Horsepower'] == '10000+'] 
raw_df[raw_df['Horsepower'] == '1,000+'] 
raw_df[raw_df['Horsepower'] == '10,000+'] 

raw_df['Horsepower'] = pd.to_numeric(raw_df['Horsepower'].replace({'10000+': '885'}), errors='coerce')
raw_df['Horsepower'] = pd.to_numeric(raw_df['Horsepower'].replace({'1,000+': '885'}), errors='coerce')
raw_df['Horsepower'] = pd.to_numeric(raw_df['Horsepower'].replace({'10,000+': '885'}), errors='coerce')


In [39]:
raw_df.isna().sum()

Car Make                   0
Car Model                  0
Year                       0
Engine Size (L)            0
Horsepower                 3
Torque (lb-ft)             1
0-60 MPH Time (seconds)    0
Price (in USD)             0
Vehicle Type               0
dtype: int64

In [36]:
raw_df = raw_df.drop(index=[364, 885, 898, 100, 354])

In [37]:
raw_df['Horsepower'].dropna()

0        379.0
1        630.0
2        661.0
3        562.0
4        710.0
         ...  
1002    1280.0
1003    1972.0
1004     789.0
1005     764.0
1006    1888.0
Name: Horsepower, Length: 998, dtype: float64

In [45]:
duplicate_rows = raw_df[raw_df.duplicated()]
duplicate_rows.value_counts().sum()


289

In [42]:
import pandas as pd

# Columns to convert
cols_to_convert = ["Engine Size (L)", "Horsepower", "Torque (lb-ft)", "0-60 MPH Time (seconds)", "Price (in USD)"]

# Convert each column to numeric (handling errors)
for col in cols_to_convert:
    raw_df[col] = pd.to_numeric(raw_df[col], errors='coerce')  # Convert non-numeric to NaN

# Convert to integer, handling NaNs
raw_df[cols_to_convert] = raw_df[cols_to_convert].fillna(0).astype(int)

# Display updated DataFrame info
print(raw_df.info())


<class 'pandas.core.frame.DataFrame'>
Index: 1001 entries, 0 to 1006
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   Car Make                 1001 non-null   int32
 1   Car Model                1001 non-null   int32
 2   Year                     1001 non-null   int64
 3   Engine Size (L)          1001 non-null   int32
 4   Horsepower               1001 non-null   int32
 5   Torque (lb-ft)           1001 non-null   int32
 6   0-60 MPH Time (seconds)  1001 non-null   int32
 7   Price (in USD)           1001 non-null   int32
 8   Vehicle Type             1001 non-null   int32
dtypes: int32(8), int64(1)
memory usage: 46.9 KB
None


In [44]:
raw_df.isna().sum()

Car Make                   0
Car Model                  0
Year                       0
Engine Size (L)            0
Horsepower                 0
Torque (lb-ft)             0
0-60 MPH Time (seconds)    0
Price (in USD)             0
Vehicle Type               0
dtype: int64