# PART 1 : Artificial dataset generation

The goal of this exercise is to work with statistical notions such as mean, standard
deviation, and correlation.
Write a script or a notebook that generates a numerical dataset with 300 data points (i.e. rows) and at least 6 columns, and saves it to a CSV file named `artificial_dataset.csv`.

The columns must satisfy the following requirements:
- They must all have a different mean
- They must all have a different standard deviation (English for "écart type")
- At least one column should contain integers.
- At least one column should contain floats.
- One column must have a mean close to 2.5.
- Some columns must be positively correlated.
- Some columns must be negatively correlated.
- Some columns must have a correlation close to 0.

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global random generator so that the np.random draws below
# produce the same dataset on every run.
np.random.seed(2804)

def create_dataframe(nb_rows=300, nb_cols=6, filename='artificial_dataset.csv'):
    """Generate a random numerical dataset and save it as a CSV file.

    The dataset contains ``nb_cols`` base columns followed by four extra
    columns, all named ``Col_<i>`` with a 1-based index:

    * ``Col_1`` .. ``Col_<nb_cols>``: draws from [0, 100) multiplied by a
      random scale and shifted by a random offset, so that columns get
      distinct means and standard deviations. Each column is randomly chosen
      to hold ints or floats; when ``nb_cols >= 2`` at least one base column
      of each kind is produced.
    * ``Col_<nb_cols+1>``: normal draws centred on 2.5 (std 0.1).
    * ``Col_<nb_cols+2>``: ``Col_1`` plus small noise (positively correlated
      with ``Col_1``).
    * ``Col_<nb_cols+3>``: ``-Col_1`` plus small noise (negatively correlated
      with ``Col_1``).
    * ``Col_<nb_cols+4>``: independent standard-normal noise (correlation
      with the other columns close to 0).

    All randomness comes from NumPy's global ``np.random`` state, so seed it
    beforehand for reproducible output.

    Args:
        nb_rows: Number of rows (data points) to generate.
        nb_cols: Number of randomly scaled base columns.
        filename: Path of the CSV file to write. Defaults to
            ``artificial_dataset.csv``, the name required by the exercise.

    Returns:
        None. The dataset is written to ``filename`` without the index.
    """
    df = pd.DataFrame()

    int_cols = 0
    for x in range(nb_cols):
        # Per-column affine transform parameters. Note that these are NOT the
        # final mean / standard deviation of the column: the raw draws span
        # [0, 100), so the column mean is roughly 50 * scale + offset and its
        # standard deviation roughly 29 * scale.
        offset = np.random.uniform(low=0, high=10)
        scale = np.random.uniform(low=0.5, high=5)

        col_type = np.random.choice(['int', 'float'])
        # Guarantee at least one int column: force it on the last iteration
        # if none has been produced so far.
        if int_cols == 0 and x == nb_cols - 1:
            col_type = 'int'
        # Guarantee at least one float column: once all but one of the base
        # columns are ints, every remaining one must be a float.
        if int_cols == nb_cols - 1:
            col_type = 'float'

        name = f'Col_{x+1}'
        if col_type == 'int':
            int_cols += 1
            data = np.random.randint(0, 100, size=nb_rows)
            # astype(int) truncates the scaled values back to integers.
            df[name] = (data * scale + offset).astype(int)
        else:
            data = np.random.uniform(0, 100, size=nb_rows)
            df[name] = data * scale + offset

    # Column with a mean close to 2.5
    df[f'Col_{nb_cols+1}'] = np.random.normal(2.5, 0.1, size=nb_rows)
    # Column positively correlated with Col_1
    df[f'Col_{nb_cols+2}'] = df['Col_1'] + np.random.normal(0, 2, size=nb_rows)
    # Column negatively correlated with Col_1
    df[f'Col_{nb_cols+3}'] = -df['Col_1'] + np.random.normal(0, 2, size=nb_rows)
    # Column with a correlation close to 0 (independent noise)
    df[f'Col_{nb_cols+4}'] = np.random.normal(0, 1, size=nb_rows)

    df.to_csv(filename, index=False)

# Generate the dataset with the default size (300 rows, 6 base columns) and
# write it to disk.
create_dataframe()