In [1]:
# Import the libraries
# Numerical computing, data manipulation and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Load the dataset from the CSV file
dataset = pd.read_csv('Salary_Data.csv')

In [2]:
# Quick summary of the dataframe: (number of rows, number of columns)
dataset.shape

(30, 2)

In [3]:
# Column names as a plain Python list (only the last expression of a cell
# is displayed, so a single expression is enough).
dataset.columns.tolist()

['YearsExperience', 'Salary']

In [4]:
# Show the first rows (head() returns 5 rows by default)
dataset.head()

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [5]:
# Show the first 20 rows
dataset.head(20)

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0
5,2.9,56642.0
6,3.0,60150.0
7,3.2,54445.0
8,3.2,64445.0
9,3.7,57189.0


In [6]:
# Data type of each column
dataset.dtypes

YearsExperience    float64
Salary             float64
dtype: object

In [7]:
# Keep every column whose dtype is not `object`, then summarise them.
# NOTE(review): despite the "_qual" suffix, this frame holds the non-object
# (numeric) columns, not the qualitative ones.
dataset_qual = dataset.select_dtypes(exclude='object')
dataset_qual.describe()

Unnamed: 0,YearsExperience,Salary
count,30.0,30.0
mean,5.313333,76003.0
std,2.837888,27414.429785
min,1.1,37731.0
25%,3.2,56720.75
50%,4.7,65237.0
75%,7.7,100544.75
max,10.5,122391.0


In [8]:
# Remove rows containing null values to avoid errors in later cells.
# The result is rebound to `dataset` instead of using inplace=True, so the
# cell performs no in-place mutation of the loaded frame.
dataset = dataset.dropna()

In [9]:
# Percentiles that describe() should report (the median is added by pandas).
perc = [0.2, 0.4, 0.6, 0.8]

# Column dtypes that describe() should include in the summary.
include = ['object', 'float', 'int64']

In [10]:
# Summary statistics restricted to the dtypes in `include`, reported at the
# percentiles listed in `perc`.
desc = dataset.describe(include=include, percentiles=perc)

In [11]:
# Display the summary table stored in `desc`
desc

Unnamed: 0,YearsExperience,Salary
count,30.0,30.0
mean,5.313333,76003.0
std,2.837888,27414.429785
min,1.1,37731.0
20%,2.98,55524.2
40%,4.0,60726.6
50%,4.7,65237.0
60%,5.54,82053.0
80%,8.3,106351.8
max,10.5,122391.0


In [12]:
# graphiques
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.tools as tls
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)
plt.style.use('fivethirtyeight')
%matplotlib inline


ModuleNotFoundError: No module named 'plotly'

In [None]:
# Encoding, scaling and imputation helpers
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split  # train/test partitioning of the dataset

# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; its
# replacement is `sklearn.impute.SimpleImputer`. The old name stays bound so
# existing references keep resolving, but SimpleImputer takes no `axis`
# argument and marks missing values with np.nan rather than the string 'NaN'.
try:
    from sklearn.impute import SimpleImputer as Imputer  # missing-value imputation
except ImportError:  # scikit-learn older than 0.20
    from sklearn.preprocessing import Imputer

# Silence warning messages
import warnings
warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None  # hides pandas chained-assignment warnings

In [None]:
# Non-object (numeric) columns only, without the identifier column when the
# dataset has one. errors='ignore' keeps the cell runnable on a dataset with
# no 'CustomerID' column, where a plain drop raises KeyError.
dataset_qual = (
    dataset
    .select_dtypes(exclude=['object'])
    .drop(columns=['CustomerID'], errors='ignore')
)

# Area chart of the describe() statistics, with the values rendered as a table.
dataset_qual.describe().plot(kind="area", fontsize=22, figsize=(18, 8), table=True, colormap="rainbow")
plt.xlabel('')
plt.ylabel('Value')
plt.title("Statistiques générales des variables ")

In [None]:
# Check for missing values, column by column.
# NOTE(review): only the sentinel value -1 is counted as missing here; NaN
# entries are not — confirm this matches how the dataset encodes gaps.
vars_with_missing = []
total_rows = dataset.shape[0]
for column in dataset.columns:
    sentinel_hits = (dataset[column] == -1).sum()
    if sentinel_hits <= 0:
        continue
    vars_with_missing.append(column)
    share = sentinel_hits / total_rows

    print('La Variable {} a enregistré {} ({:.2%}) valeurs manquantes'.format(column, sentinel_hits, share))

print('En total, il a {} variables avec des valeurs manquantes'.format(len(vars_with_missing)))


In [None]:
# Describe a qualitative variable: summary statistics and first rows.
# NOTE(review): the column listing shown earlier in this notebook contains
# only 'YearsExperience' and 'Salary', so 'Genre' looks like a leftover from
# another dataset and this lookup would raise KeyError — confirm.
dataset['Genre'].describe(), dataset['Genre'].head()

In [None]:
# Summary statistics with one row per column (transposed describe()).
dataset.describe().T

In [None]:
# Visualise the data: density curve of 'Age', one curve per 'Genre' value.
# NOTE(review): neither 'Age' nor 'Genre' appears in the column listing shown
# earlier in this notebook (only 'YearsExperience' and 'Salary') — confirm
# which columns this plot should use.
a = sns.FacetGrid( dataset, hue = 'Genre', aspect=4 )
# NOTE(review): recent seaborn releases deprecate `shade` in favour of `fill`
# — confirm against the installed version.
a.map(sns.kdeplot, 'Age', shade= True )
# The x axis runs from 0 to the largest 'Age' value.
a.set(xlim=(0 , dataset['Age'].max()))
a.add_legend()

In [None]:
# Share (pandas pie chart) and count (seaborn bar chart) of each 'Genre' value.
# NOTE(review): 'Genre' is not among the columns listed earlier in this
# notebook ('YearsExperience', 'Salary') — confirm the intended column. The
# two-element `explode` also presumes exactly two distinct values.
f, ax = plt.subplots(1, 2, figsize=(16, 7))
dataset['Genre'].value_counts().plot.pie(explode=[0.1, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Cible Count')
ax[0].set_ylabel('Count')
# The column is passed by keyword: since seaborn 0.12 the first positional
# argument of countplot is `data`, so a positional column name is rejected.
sns.countplot(x='Genre', data=dataset, ax=ax[1])
ax[1].set_title('Cible Count')
plt.show()

In [None]:
# Distribution of three numeric columns split by 'Genre', drawn on a 2x2
# grid of which three slots are used.
# NOTE(review): 'Genre', 'Age', 'Annual Income (k$)' and
# 'Spending Score (1-100)' are not among the columns listed earlier in this
# notebook ('YearsExperience', 'Salary') — confirm the intended columns.
plt.figure(figsize=(15,10))
plt.subplot(2,2,1)
sns.boxenplot(x='Genre',y='Age',data=dataset)
plt.subplot(2,2,2)
sns.boxenplot(x='Genre',y='Annual Income (k$)',data=dataset)
plt.subplot(2,2,3)
sns.violinplot(x='Genre',y='Spending Score (1-100)',data=dataset)

In [None]:
# Pairwise scatter plots and per-column distributions.
# The original call coloured points by hue='Genre', a column this dataset
# does not have (its columns are 'YearsExperience' and 'Salary'), which
# raises KeyError. The hue is dropped; re-add `hue=<column>` once a
# categorical target column is available.
sns.pairplot(dataset)

In [None]:
# Pairwise correlation matrix of the dataset's columns
dataset.corr()

In [None]:
# Scatter plot with a fitted regression line.
# The original call used 'Annual Income (k$)', 'Spending Score (1-100)' and
# hue='Genre', none of which exist in this dataset; it is pointed at the two
# columns the dataset does have. `palette` only applies with a hue, so it is
# dropped along with it.
sns.lmplot(x='YearsExperience', y='Salary', data=dataset, scatter_kws={'alpha': 0.3})

In [None]:
# Examine a single variable: distinct values, value frequencies and summary
# statistics. The original cell hard-coded 'Age', which this dataset does not
# contain (KeyError); the column name is now set once so it can be switched
# to any existing column, e.g. 'Salary'.
column_to_inspect = 'YearsExperience'
print(dataset[column_to_inspect].unique())
print(dataset[column_to_inspect].value_counts(sort=True))
print(dataset[column_to_inspect].describe())