# Introduction aux données

In [258]:
import pandas as pd


# Load the dataset from auto.csv (relative path, resolved against the working directory).
cars = pd.read_csv('auto.csv')


In [259]:
# Distinct values present in the 'origin' column.
unique_regions = cars['origin'].unique()

In [260]:
# Show the distinct origin codes as plain text.
print(unique_regions)

[1 3 2]


In [261]:
# Preview only the first rows; a bare `cars` asks the notebook to render the
# whole frame, which bloats the saved output.
cars.head(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1
5,15.0,8,429.0,198.0,4341.0,10.0,70,1
6,14.0,8,454.0,220.0,4354.0,9.0,70,1
7,14.0,8,440.0,215.0,4312.0,8.5,70,1
8,14.0,8,455.0,225.0,4425.0,10.0,70,1
9,15.0,8,390.0,190.0,3850.0,8.5,70,1


# Variables Nominales

In [262]:
# One indicator column per distinct cylinder count (cyl_<n>), appended to the
# right of the existing columns.
dummy_cylinders = pd.get_dummies(data=cars["cylinders"], prefix='cyl')
cars = pd.concat(objs=[cars, dummy_cylinders], axis='columns')

In [263]:
# One indicator column per distinct model year (year_<n>).
dummy_years = pd.get_dummies(data=cars['year'], prefix='year')

In [264]:
# Append the year indicator columns to the right of the existing columns.
cars = pd.concat(objs=[cars, dummy_years], axis='columns')

In [265]:
# The raw 'year' column is now represented by the year_* indicators.
cars = cars.drop(columns='year')

In [266]:
# The raw 'cylinders' column is now represented by the cyl_* indicators.
cars = cars.drop(columns='cylinders')

In [267]:
# Check the frame after swapping 'cylinders' and 'year' for their indicator columns.
cars.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cyl_3,cyl_4,cyl_5,cyl_6,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
0,18.0,307.0,130.0,3504.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,350.0,165.0,3693.0,11.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18.0,318.0,150.0,3436.0,11.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16.0,304.0,150.0,3433.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,302.0,140.0,3449.0,10.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Classification Multi-Classe

In [268]:
import numpy as np

# Shuffle the rows before splitting so train and test are random samples.
# A seeded generator makes the shuffle (and every result derived from it)
# reproducible across kernel restarts.
rng = np.random.RandomState(1)

# Permute row *positions*, not index labels: .iloc is positional, so this stays
# correct even if `cars` does not carry a default 0..n-1 index.
shuffled_rows = rng.permutation(len(cars))
shuffled_cars = cars.iloc[shuffled_rows]

In [269]:
# Row count of the shuffled frame.
len(shuffled_cars)

392

In [270]:
# Train on the first 70% of the shuffled rows, test on the remaining 30%.
# The cut-off is derived from the frame length instead of being hard-coded,
# so the split stays 70/30 if the dataset size changes (274 for 392 rows).
TRAIN_FRACTION = 0.7
train_size = int(len(shuffled_cars) * TRAIN_FRACTION)

train = shuffled_cars.iloc[:train_size]
test = shuffled_cars.iloc[train_size:]

# Entraîner un modèle de régression logistique multi-classe

In [271]:
# Distinct origin codes (unsorted here; a later cell recomputes and sorts them).
unique_origins = cars['origin'].unique()

In [272]:
# Display the origin codes.
unique_origins

array([1, 3, 2], dtype=int64)

In [273]:
# Distinct origin codes in ascending order; these become the class labels
# iterated over when fitting and scoring the per-origin models.
unique_origins = np.sort(cars['origin'].unique())

In [274]:
from sklearn.linear_model import LogisticRegression

# Model inputs: only the cylinder and year indicator columns.
features = [name for name in train.columns if name.startswith(('cyl_', 'year_'))]

In [275]:
# One-vs-rest setup: fit one binary classifier per origin code, each trained to
# separate "row has this origin" (True) from "row has any other origin" (False).
X_train = train[features]  # same inputs for every classifier
models = {}

for origin in unique_origins:
    Y_train = train['origin'] == origin  # boolean target for this origin
    model = LogisticRegression(solver='liblinear').fit(X_train, Y_train)
    models[origin] = model
    

In [276]:
# Inspect the fitted estimators, keyed by origin code.
models

{1: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False),
 2: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False),
 3: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                    wa

In [277]:
# Boolean mask over train's columns marking the names that begin with "origin".
train.columns.str.startswith("origin")

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False])

# Tester les modèles

In [278]:
# Preview only the first rows; a bare `cars` asks the notebook to render the
# whole frame, which bloats the saved output.
cars.head(10)

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cyl_3,cyl_4,cyl_5,cyl_6,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
0,18.0,307.0,130.0,3504.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,350.0,165.0,3693.0,11.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18.0,318.0,150.0,3436.0,11.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16.0,304.0,150.0,3433.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,302.0,140.0,3449.0,10.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,15.0,429.0,198.0,4341.0,10.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,14.0,454.0,220.0,4354.0,9.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,14.0,440.0,215.0,4312.0,8.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,14.0,455.0,225.0,4425.0,10.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,15.0,390.0,190.0,3850.0,8.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [279]:
# Empty frame with one column per origin code and no rows yet.
testing_probs = pd.DataFrame(columns=unique_origins)

In [314]:
# NOTE(review): this cell sits above the loop that fills `testing_probs`, and its
# execution count (314) is later than that loop's (285). On a fresh top-to-bottom
# run it would display the empty frame; consider moving it below the loop.
testing_probs

Unnamed: 0,1,2,3
0,0.847807,0.082767,0.068959
1,0.958588,0.022562,0.039345
2,0.304195,0.362105,0.321457
3,0.969839,0.024414,0.024491
4,0.215466,0.402824,0.391049
5,0.969839,0.024414,0.024491
6,0.281821,0.191568,0.526705
7,0.560007,0.126880,0.328202
8,0.785932,0.081024,0.109983
9,0.281821,0.191568,0.526705


In [285]:
# Score the held-out rows with each per-origin classifier. Each model was fit on
# a boolean target, so column 1 of predict_proba is the probability that the
# row belongs to that origin.
for origin in unique_origins:
    classifier = models.get(origin)
    prediction = classifier.predict_proba(test[features])
    testing_probs[origin] = prediction[:, 1]

In [318]:
# For every test row, pick the origin whose classifier gave the highest probability.
predicted_origins = testing_probs.idxmax(axis='columns')

In [322]:
# First 30 predicted origin codes.
predicted_origins.head(30)

0     1
1     1
2     2
3     1
4     2
5     1
6     3
7     1
8     1
9     3
10    3
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    2
24    1
25    1
26    1
27    2
28    1
29    3
dtype: int64

In [321]:
# First 30 held-out rows, for eyeballing against the predictions.
# NOTE(review): `test` keeps its shuffled index labels while the displayed
# `predicted_origins` is indexed 0..n-1, so compare them by position, not by label.
test.head(30)

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cyl_3,cyl_4,cyl_5,cyl_6,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
16,18.0,199.0,97.0,2774.0,15.5,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
220,17.0,260.0,110.0,4060.0,19.0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
75,18.0,121.0,112.0,2933.0,14.5,2,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
93,13.0,440.0,215.0,4735.0,11.0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
144,28.0,90.0,75.0,2125.0,14.5,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
86,13.0,350.0,145.0,3988.0,13.0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
348,29.9,98.0,65.0,2380.0,20.7,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
363,27.0,112.0,88.0,2640.0,18.6,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
125,19.0,232.0,100.0,2901.0,16.0,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
338,30.0,135.0,84.0,2385.0,12.9,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
