## DATA WRANGLING

In [138]:
import pandas as pd
import numpy as np

In [140]:
df = pd.read_csv("/Users/cansezgin/Python-Output/automobile.csv")

#### The first method: drop the rows or columns with missing values

In [143]:
df.dropna(subset=["price"], axis=0, inplace=True)

# Axis = 0 drops the entire row
# Axis = 1 drops the entire column
# Inplace = True writes the result back into the dataframe

#### The second method: Replace the missing values with column mean

In [146]:
# Convert the column to numeric, forcing non-numeric entries to NaN
df["normalized-losses"] = pd.to_numeric(df["normalized-losses"], errors='coerce')

# Now compute the mean, skipping NaN values
mean = df["normalized-losses"].mean()

print("Mean of normalized-losses:", mean)

Mean of normalized-losses: 122.0


In [148]:
# Replace NaN values with the mean
df["normalized-losses"].fillna(mean, inplace=True)

# Optional: confirm there are no more missing values
print(df["normalized-losses"].isnull().sum())

0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["normalized-losses"].fillna(mean, inplace=True)


### Data Formating

In [153]:
df["city-mpg"] = 235/df["city-mpg"]

In [159]:
df.rename(columns={"city-mpg":"city-L/100km"}, inplace=True)

#### Convert Data Type

In [166]:
df["price"].tail(5)

200    16845
201    19045
202    21485
203    22470
204    22625
Name: price, dtype: object

In [175]:
# Step 1: Convert to numeric, turn "?" into NaN
df["price"] = pd.to_numeric(df["price"], errors="coerce")

# Step 2: Fill missing values with the mean
mean_price = df["price"].mean()
df["price"] = df["price"].fillna(mean_price)

In [177]:
# Step 3: Convert to integer
df["price"] = df["price"].astype(int)

In [181]:
df["price"].tail(5)

200    16845
201    19045
202    21485
203    22470
204    22625
Name: price, dtype: int64

### Data Normalisation

#### (1) Simple Feature Scaling

- X new = X old / X max

In [202]:
df["length"] = df["length"]/df["length"].max()

In [204]:
df["length"]

0      0.811148
1      0.811148
2      0.822681
3      0.848630
4      0.848630
         ...   
200    0.907256
201    0.907256
202    0.907256
203    0.907256
204    0.907256
Name: length, Length: 205, dtype: float64

#### (2) Min-Max

- X new = X old - X min / X max - X min

In [206]:
df["length"] = (df["length"]-df["length"].min())/(df["length"].max()-df["length"].min())

In [208]:
df["length"]

0      0.413433
1      0.413433
2      0.449254
3      0.529851
4      0.529851
         ...   
200    0.711940
201    0.711940
202    0.711940
203    0.711940
204    0.711940
Name: length, Length: 205, dtype: float64

#### (3) Z-Score

- X new = X old - M / Q

In [212]:
df["length"] = (df["length"]-df["length"].mean())/df["length"].std()

In [214]:
df["length"]

0     -0.425480
1     -0.425480
2     -0.230948
3      0.206750
4      0.206750
         ...   
200    1.195622
201    1.195622
202    1.195622
203    1.195622
204    1.195622
Name: length, Length: 205, dtype: float64

### Binning in Python

In [217]:
bins = np.linspace(min(df["price"]), max(df["price"]),4)

In [219]:
group_names = ["Low", "Medium", "High"]

In [221]:
df["price-binned"] = pd.cut(df["price"], bins, labels = group_names, include_lowest = True)

In [225]:
df["price-binned"]

0         Low
1         Low
2         Low
3         Low
4         Low
        ...  
200       Low
201    Medium
202    Medium
203    Medium
204    Medium
Name: price-binned, Length: 205, dtype: category
Categories (3, object): ['Low' < 'Medium' < 'High']

### Turning Categorical Variables into Quantitative Variables in Python (ONE-HOT-ENCODING)

In [235]:
pd.get_dummies(df["fuel-type"]).astype(int)

Unnamed: 0,diesel,gas
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
200,0,1
201,0,1
202,0,1
203,1,0
