In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Cosa fare
- Assicurarsi della qualità dei dati:
    - Controllare che le variabili siano state importate correttamente
    - Controllare che non ci siano valori mancanti.
        - Gestire i valori mancanti con rimozione
        - Gestire i valori mancanti con imputazione
    - Controllare la consistenza dei dati (sono nel formato giusto)
    - Controllo duplicati
- Analisi Esplorativa
    - Macrodescrittive per le problematiche evidenti
    - Matrice di correlazione
        - evidenziare anomalie o risultati inaspettati
        - evidenziare variabili fortemente correlate che potrebbero causare collinearità
    - Distribuzione delle variabili
        - Grafico incrociato con pairplot di seabor
        - Visualizzare le distribuzioni in maniera dettagliata:
            - Violin plot
            - Histogram
            - Box plot
        - Approfondire su variabili più importanti
        - Analisi dettagliate per le variabili qualitative (studiare)

In [None]:
# Load the data
df = pd.read_csv('../data/mxmh_survey_results.csv')


In [7]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736 entries, 0 to 735
Data columns (total 33 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Timestamp                     736 non-null    object 
 1   Age                           735 non-null    float64
 2   Primary streaming service     735 non-null    object 
 3   Hours per day                 736 non-null    float64
 4   While working                 733 non-null    object 
 5   Instrumentalist               732 non-null    object 
 6   Composer                      735 non-null    object 
 7   Fav genre                     736 non-null    object 
 8   Exploratory                   736 non-null    object 
 9   Foreign languages             732 non-null    object 
 10  BPM                           629 non-null    float64
 11  Frequency [Classical]         736 non-null    object 
 12  Frequency [Country]           736 non-null    object 
 13  Frequ

In [19]:
df.describe()


Unnamed: 0,Age,Hours per day,BPM,Anxiety,Depression,Insomnia,OCD,Music effects enc
count,736.0,736.0,736.0,736.0,736.0,736.0,736.0,736.0
mean,25.206803,3.572758,1589948.0,5.837636,4.796196,3.738451,2.637228,1.724185
std,12.046766,3.028199,36856220.0,2.793054,3.02887,3.088689,2.842017,0.496257
min,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18.0,2.0,105.0,4.0,2.0,1.0,0.0,1.0
50%,21.0,3.0,128.0,6.0,5.0,3.0,2.0,2.0
75%,28.0,5.0,163.0,8.0,7.0,6.0,5.0,2.0
max,89.0,24.0,1000000000.0,10.0,10.0,10.0,10.0,2.0


In [16]:
righe_null = df[df.isna().any(axis=1)]
righe_null.shape #Show dimension of the dataframe with null values



(120, 34)

In [17]:
# Substitute NaN with the mean value in the numerical columns
df.fillna({col: df[col].mean() for col in df.select_dtypes(include=['number']).columns}, inplace=True)

# Substitute NaN with the mode value in the object columns
df.fillna({col: df[col].mode()[0] for col in df.select_dtypes(include=['object']).columns}, inplace=True)

In [18]:
effects = df['Music effects'].value_counts().index

# Create a dictionary to map the values to integers
def map_effects(effect) :
    if effect == effects[0] :
        return 2
    elif effect == effects[1] :
        return 1
    elif effect == effects[2] :
        return 0
    else :
        return 

# Create a new column with the mapped values
df['Music effects enc'] = df['Music effects'].map(lambda effect : map_effects(effect))



In [None]:
#Check for duplicates
df.duplicated().sum() 


np.int64(0)