<a href="https://colab.research.google.com/github/Vaishnavi-P-Kudalkar/LocalRepo/blob/main/Categorical_features_LinearRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Method 1: Manual Conversion to Dummy Variables
This involves using pandas' get_dummies function to convert categorical features to binary (dummy) variables and then explicitly dropping one of the dummy variables to avoid multicollinearity.

Method 2: Using OneHotEncoder from sklearn
This involves using OneHotEncoder within a ColumnTransformer to automatically handle the conversion to dummy variables and configure it to drop one category to avoid the dummy variable trap.

```
`# This is formatted as code`
```



In [15]:
import pandas as pd
import numpy as np

In [16]:
df = pd.read_csv('/content/carprices.csv')
df

Unnamed: 0,Car_Model,Mileage,Sell_price,Age(yrs)
0,BMW_X5,69000,18000,6
1,BMW_X5,35000,34000,3
2,BMW_X5,57000,26100,5
3,BMW_X5,22500,40000,2
4,BMW_X5,46000,31500,4
5,Audi_A5,59000,29400,5
6,Audi_A5,52000,32000,5
7,Audi_A5,72000,19300,6
8,Audi_A5,91000,12000,8
9,Mercedez_Benz_C_class,67000,22000,6


In [17]:
dummies = pd.get_dummies(df.Car_Model)
merged = pd.concat([df,dummies],axis='columns')
merged

Unnamed: 0,Car_Model,Mileage,Sell_price,Age(yrs),Audi_A5,BMW_X5,Mercedez_Benz_C_class
0,BMW_X5,69000,18000,6,False,True,False
1,BMW_X5,35000,34000,3,False,True,False
2,BMW_X5,57000,26100,5,False,True,False
3,BMW_X5,22500,40000,2,False,True,False
4,BMW_X5,46000,31500,4,False,True,False
5,Audi_A5,59000,29400,5,True,False,False
6,Audi_A5,52000,32000,5,True,False,False
7,Audi_A5,72000,19300,6,True,False,False
8,Audi_A5,91000,12000,8,True,False,False
9,Mercedez_Benz_C_class,67000,22000,6,False,False,True


In [21]:
merged['Audi_A5'] = merged['Audi_A5'].astype(int)
merged['BMW_X5'] = merged['BMW_X5'].astype(int)
merged['Mercedez_Benz_C_class'] = merged['Mercedez_Benz_C_class'].astype(int)
merged

Unnamed: 0,Car_Model,Mileage,Sell_price,Age(yrs),Audi_A5,BMW_X5,Mercedez_Benz_C_class
0,BMW_X5,69000,18000,6,0,1,0
1,BMW_X5,35000,34000,3,0,1,0
2,BMW_X5,57000,26100,5,0,1,0
3,BMW_X5,22500,40000,2,0,1,0
4,BMW_X5,46000,31500,4,0,1,0
5,Audi_A5,59000,29400,5,1,0,0
6,Audi_A5,52000,32000,5,1,0,0
7,Audi_A5,72000,19300,6,1,0,0
8,Audi_A5,91000,12000,8,1,0,0
9,Mercedez_Benz_C_class,67000,22000,6,0,0,1


In [26]:
final = merged.drop(['Car_Model', 'Mercedez_Benz_C_class'],axis='columns')
final

Unnamed: 0,Mileage,Sell_price,Age(yrs),Audi_A5,BMW_X5
0,69000,18000,6,0,1
1,35000,34000,3,0,1
2,57000,26100,5,0,1
3,22500,40000,2,0,1
4,46000,31500,4,0,1
5,59000,29400,5,1,0
6,52000,32000,5,1,0
7,72000,19300,6,1,0
8,91000,12000,8,1,0
9,67000,22000,6,0,0


In [28]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [30]:
X= final.drop('Sell_price', axis='columns')
X

Unnamed: 0,Mileage,Age(yrs),Audi_A5,BMW_X5
0,69000,6,0,1
1,35000,3,0,1
2,57000,5,0,1
3,22500,2,0,1
4,46000,4,0,1
5,59000,5,1,0
6,52000,5,1,0
7,72000,6,1,0
8,91000,8,1,0
9,67000,6,0,0


In [32]:
y = final.Sell_price
y

0     18000
1     34000
2     26100
3     40000
4     31500
5     29400
6     32000
7     19300
8     12000
9     22000
10    20000
11    21000
12    33000
Name: Sell_price, dtype: int64

In [33]:
model.fit(X,y)

In [35]:
model.predict([[83000,7,0,0]])



array([18929.31674102])

In [38]:
model.score(X,y)

0.9417050937281082

In [41]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [45]:
dfle = df
dfle.Car_Model = le.fit_transform(dfle.Car_Model)
dfle

Unnamed: 0,Car_Model,Mileage,Sell_price,Age(yrs)
0,1,69000,18000,6
1,1,35000,34000,3
2,1,57000,26100,5
3,1,22500,40000,2
4,1,46000,31500,4
5,0,59000,29400,5
6,0,52000,32000,5
7,0,72000,19300,6
8,0,91000,12000,8
9,2,67000,22000,6


In [64]:
X = dfle[['Car_Model','Mileage','Age(yrs)']].values
X

array([[    1, 69000,     6],
       [    1, 35000,     3],
       [    1, 57000,     5],
       [    1, 22500,     2],
       [    1, 46000,     4],
       [    0, 59000,     5],
       [    0, 52000,     5],
       [    0, 72000,     6],
       [    0, 91000,     8],
       [    2, 67000,     6],
       [    2, 83000,     7],
       [    2, 79000,     7],
       [    2, 59000,     5]])

In [65]:
y = df.Sell_price
y

0     18000
1     34000
2     26100
3     40000
4     31500
5     29400
6     32000
7     19300
8     12000
9     22000
10    20000
11    21000
12    33000
Name: Sell_price, dtype: int64

In [71]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(drop='first'), ['Car Model'])  # Drop first to avoid dummy variable trap
    ],
    remainder='passthrough'
)

# Prepare the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', ct),
    ('regressor', LinearRegression())
])

In [67]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categorical_features=[0])

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'categorical_features'

In [76]:
X= dfle.drop('Sell_price',axis='columns')
X

Unnamed: 0,Car_Model,Mileage,Age(yrs)
0,1,69000,6
1,1,35000,3
2,1,57000,5
3,1,22500,2
4,1,46000,4
5,0,59000,5
6,0,52000,5
7,0,72000,6
8,0,91000,8
9,2,67000,6


In [77]:
X=[:,:1]
X

SyntaxError: invalid syntax (<ipython-input-77-ad9b2f52dff8>, line 1)

In [78]:
model.fit(X,y)

In [80]:
model.predict([[2,83000,7]])



array([18443.76378491])