In [1]:
#import librairies


import streamlit as st
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import sklearn

In [2]:
df = pd.read_csv("RENDEMENT.csv", sep=';')

In [3]:
df = df.rename({"region":"Region"}, axis=1)
df = df.rename({"culture":"Culture"}, axis=1)

In [4]:
df = df[df["Rendement kg/ha"].notnull()]

In [5]:
df = df.rename({"Rendement kg/ha":"Rendement"}, axis=1)
df = df.rename({"Superficie en ha":"Superficie"}, axis=1)
df = df.rename({"Production en tonne":"Production"}, axis=1)

In [6]:
#suppression des colonnes non prises en compte dans le modele

df = df.drop("Date", axis=1)

In [7]:
df.head()

Unnamed: 0,Region,Culture,Production,Superficie,Rendement
0,DAKAR,ARACHIDE,1884.493889,3140.823149,764.142857
1,DAKAR,ARACHIDE,55.762016,371.74677,150.0
2,DAKAR,ARACHIDE,80.0,400.0,200.0
3,DAKAR,ARACHIDE,556.529307,894.741651,622.0
4,DAKAR,ARACHIDE,741.750042,1211.026408,612.497


In [8]:
df['Culture'].value_counts()


ARACHIDE    331
MAIS        321
NIEBE       315
SORGHO      314
MIL         302
MANIOC      300
RIZ         223
FONIO        62
Name: Culture, dtype: int64

In [9]:
#Nous allons codifier la variable culture

df_dict = {
    'ARACHIDE': 1,
    'MAIS': 2,
    'NIEBE': 3,
    'SORGHO': 4,
    'MIL': 5,
    'MANIOC': 6,
    'RIZ': 7,
    'FONIO': 8
}
df['Culture_num']=df['Culture'].map(df_dict)

In [10]:
df['Culture_num'].value_counts()


1    331
2    321
3    315
4    314
5    302
6    300
7    223
8     62
Name: Culture_num, dtype: int64

In [11]:
df.head()

Unnamed: 0,Region,Culture,Production,Superficie,Rendement,Culture_num
0,DAKAR,ARACHIDE,1884.493889,3140.823149,764.142857,1
1,DAKAR,ARACHIDE,55.762016,371.74677,150.0,1
2,DAKAR,ARACHIDE,80.0,400.0,200.0,1
3,DAKAR,ARACHIDE,556.529307,894.741651,622.0,1
4,DAKAR,ARACHIDE,741.750042,1211.026408,612.497,1


In [12]:
x = df.drop(['Culture','Culture_num'],axis=1)
y = df['Culture_num']

In [13]:
print(x)

          Region   Production   Superficie    Rendement
0          DAKAR  1884.493889  3140.823149   764.142857
1          DAKAR    55.762016   371.746770   150.000000
2          DAKAR    80.000000   400.000000   200.000000
3          DAKAR   556.529307   894.741651   622.000000
4          DAKAR   741.750042  1211.026408   612.497000
...          ...          ...          ...          ...
2182  ZIGUINCHOR  2425.685825   938.732904  2584.000000
2183  ZIGUINCHOR  1606.500000   850.000000  1890.000000
2184  ZIGUINCHOR  1735.236137   889.000000  1951.896667
2185  ZIGUINCHOR  1585.839485   759.079449  2089.161400
2186  ZIGUINCHOR  2944.100000   998.000000  2950.000000

[2168 rows x 4 columns]


In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split


In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [16]:
#transformation des variables

ohe = OneHotEncoder(drop='first')    #pour encoder des variables nominales non ordinales
scale = StandardScaler()             #Utilis√© pour normaliser les variables

preprocesser4 = ColumnTransformer(
        transformers = [
            ('StandardScale', scale, [1, 2, 3]),
            ('OHE', ohe, [0]),
        ],
        remainder='passthrough'
)

In [17]:
x_train_encoder = preprocesser4.fit_transform(x_train)
x_test_encoder = preprocesser4.transform(x_test)

In [18]:
#entrainement des differents modeles


from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# creation dune instance pr chaque modele
models = {
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
}


for name, md in models.items():
    md.fit(x_train_encoder,y_train)
    y_pred = md.predict(x_test_encoder)
    
    print(f"{name}  with accuracy : {accuracy_score(y_test,y_pred)}")

K-Nearest Neighbors  with accuracy : 0.7073732718894009
Decision Tree  with accuracy : 0.6658986175115207
Random Forest  with accuracy : 0.771889400921659


In [19]:
print(y_test)

1505    4
611     2
2006    3
1731    1
1532    6
       ..
446     2
1341    2
1260    3
20      2
1851    7
Name: Culture_num, Length: 434, dtype: int64


In [20]:
#on fais le choix du random forest classifier

rdfc = RandomForestClassifier()
rdfc.fit(x_train_encoder,y_train)
y_pred = rdfc.predict(x_test_encoder)
accuracy_score(y_test,y_pred)

0.7857142857142857

In [21]:
# test de la prediction avec de nouvelles valeures

X = np.array([["DAKAR", 12000, 10 , 200]])

In [22]:
print(X)

[['DAKAR' '12000' '10' '200']]


In [23]:
X_essai_rdfc= preprocesser4.transform(X)

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


In [24]:
y_essai_rdfc=rdfc.predict(X_essai_rdfc)

In [25]:
print(y_essai_rdfc)

[4]


In [26]:
#sauvegarde du modele dans un fichier joblib

import joblib


In [27]:

#creer un dictionnaire 

data = {"model": rdfc, "encoder": preprocesser4}

#ouverture du fichier pickle contenant le dictionnaire
with open('saved_file_recommandation.joblib', 'wb') as file:
        joblib.dump(data, file)

In [28]:
#verification de la sauvegarde

with open('saved_file_recommandation.joblib', 'rb') as file:
    data = joblib.load(file)
    
rdfc_loaded = data["model"]    
preprocesser4 = data["encoder"]

In [29]:
#refaire la prediction
y_pred_reboot_rdfc = rdfc_loaded.predict(X_essai_rdfc)
print(y_pred_reboot_rdfc)

[4]
