# Practical Statistics for Data Scientists (Python)
# Chapter 1. Exploratory Data Analysis

Import required Python packages.

In [2]:
%matplotlib inline

from pathlib import Path

import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import trim_mean
from statsmodels import robust
import wquantiles

import seaborn as sns
import matplotlib.pylab as plt

print("Imports Done!")

Imports Done!


Define paths to data sets. If you don't keep your data in the same directory as the code, adapt the path names.

In [3]:
DATA = "~/PycharmProjects/Datasets/"
mtcar_path = DATA + "mtcars.csv"
mtcar = pd.read_csv(mtcar_path)

In [None]:
mtcar.head()

In [None]:
mtcar.shape

In [None]:
mtcar.describe()

In [None]:
mtcar_df = mtcar.copy()

In [None]:
mtcar_df = mtcar_df.rename(columns={"Unnamed: 0" : "Car Model"})

In [None]:
mtcar_df["vs"] = mtcar_df["vs"].replace({0 : "V", 1 : "S"})

In [None]:
mtcar_df["am"] = mtcar_df["am"].replace({0 : "Auto", 1 : "Manual"})

In [None]:
result = mtcar_df["qsec"][(mtcar_df["cyl"] != 3) & (mtcar_df["mpg"] > 20)].mean()
result

In [None]:
mtcar_df.groupby(["vs", "am"]).agg({"hp" : "mean"})

In [None]:
mtcar_df.groupby("am").agg("median")

In [None]:
mtcar_df.groupby("am").agg("median")

In [None]:
mtcar_df.groupby(["am", "vs"]).agg("std").iloc[:,[0,2]]

In [None]:
my_stats = mtcar_df.groupby(["am", "vs"]).agg("std").iloc[:,[0,2]]

In [None]:
descriptions_stat = mtcar_df.groupby(["am"]).agg("std").loc[:, ["hp", "disp"]]
descriptions_stat

In [None]:
round(mtcar_df.groupby(["am", "vs"]).agg({"qsec" : ["count", "min", "max", "mean", "std", "sem"]}), 2)

In [None]:
mtcar_df.isna().sum()

Dealing with NA 

In [None]:
# mtcar_df.head(15)

In [None]:
# mtcar_df["mpg"].mean()

In [None]:
# mtcar_df["mpg"].iloc[1:10] = None

In [None]:
# mtcar_df.head(15)

In [None]:
# mtcar_df["mpg"].mean()

In [None]:
sns.histplot(mtcar_df["mpg"])

In [None]:
ax = sns.boxplot(x="am", y="mpg", data=mtcar_df)

In [None]:
sns.scatterplot(data=mtcar_df, x="mpg", y="hp", hue="vs", size= "qsec")

In [None]:
sns.scatterplot(data=mtcar_df, x="mpg", y="disp", hue="hp")

In [None]:
sns.displot(data=mtcar_df, x="mpg", hue="am", kde=True)

In [None]:
mtcar_df.head()

In [None]:
d = mtcar_df.groupby(["vs", "am"])["am"].count().unstack()
d

In [None]:
p_value = scipy.stats.fisher_exact(d)[1]

In [7]:
# Предполагается, что Вы уже скачали необходимый датасет и знаете путь до него
import pandas as pd
import scipy.stats

mtcar_path = PATH_TO_DATA + "mtcars.csv"
mtcar = pd.read_csv(mtcar_path)
d = mtcar.groupby(["vs", "am"])["am"].count().unstack()
p_value = scipy.stats.fisher_exact(d)[1]

NameError: name 'PATH_TO_DATA' is not defined

### NEW DATASET 

In [None]:
DATA = "~/PycharmProjects/Datasets/"
airquality_path = DATA + "airquality.csv"
airquality = pd.read_csv(airquality_path)

In [None]:
airquality = airquality.drop("Unnamed: 0", axis=1)

In [None]:
airquality.head(10)

In [None]:
airquality_df = airquality.copy()

In [None]:
airquality_df.loc[airquality_df.Month >= 7]
# ИЛИ
airquality_df.query("Month >= 7")

In [None]:
aq_ss = airquality_df.query("Month >= 7")

In [None]:
aq_ss.groupby("Month").agg({"Ozone" : "count"})
# ИЛИ 
# airquality_df.query("Month >= 7").groupby("Month").agg({"Ozone" : "count"})

In [None]:
sns.boxplot(x="Month", y="Ozone", data=airquality_df)

### NEW DATASET

In [None]:
DATA = "~/PycharmProjects/Datasets/"
iris_path = DATA + "iris.csv"
iris = pd.read_csv(iris_path)

In [None]:
iris = iris.drop("Unnamed: 0", axis=1)

In [None]:
iris.head(10)

In [None]:
iris.describe()

In [None]:
iris.agg("std")

In [None]:
iris[iris.Species == "virginica"].median().sort_values(ascending=False)

In [None]:
sns.displot(data=iris, x="Sepal.Length", hue="Species", multiple="stack")

## NEW CHAPTER AND NEW DATASET

In [None]:
DATA = "/home/aleksey/PycharmProjects/Datasets/"
grants_path = DATA + "grants.csv"
grants = pd.read_csv(grants_path)

In [None]:
grants.head()

In [None]:
grants.shape

In [None]:
grants.isna().sum()

In [None]:
grants.describe()

In [None]:
grants_df = grants.copy()

In [None]:
grants_df["status"] = grants_df["status"].replace({0 : "Not funded", 1 : "Funded"})

In [None]:
grants_df["status"].unique()

In [None]:
grants_df.head(3)

In [None]:
table_1 = pd.pivot_table(grants_df, index="status", values="field", aggfunc="count")
table_1

In [None]:
table_2 = pd.pivot_table(grants_df, index=["field"], columns=["status"], aggfunc="count")
table_2

In [None]:
# Неудачные попытки
# -----------------------------
# grants_df.groupby(["field", "status"]).agg({"status" : "count"}).unstack()
# -----------------------------
# table_2 = grants_df.pivot_table(index="status", columns=["field"], aggfunc="count")
# table_2

In [None]:
dist = grants_df.groupby(["field", "status"])["status"].count().unstack()
dist

In [None]:
# Binom test
p_value_1 = scipy.stats.binom_test(table_1)
p_value_1

In [None]:
# ChiSquare test
cs_res_1 = scipy.stats.chisquare(table_1)
print(cs_res_1)
print("The chi-squared test statistic is {}; \nThe p-value of the test is {}.".format(cs_res_1[0], cs_res_1[1]))

In [None]:
cs_res_2 = scipy.stats.chisquare(dist.T)
cs_res_2

### New Dataset

In [None]:
DATA = "~/PycharmProjects/Datasets/"
hec_path = DATA + "HairEyeColor.csv"
hec = pd.read_csv(hec_path)

In [None]:
hec = hec.drop("Unnamed: 0", axis=1)

In [None]:
hec.head()

In [None]:
hec_fem = hec[hec.Sex == "Female"]
hec_fem

In [None]:
# Неудачные попытки
# sns.histplot(data=hec_fem, x="Hair", y="Freq")

In [None]:
sns.catplot(data=hec_fem, x="Hair", y="Freq", hue="Eye", kind="bar")

In [None]:
# Как сделать хорошо и правильно? - Посмотри туториалы.
# titanic = sns.load_dataset("titanic")
# sns.catplot(x="sex", y="survived", hue="class", kind="bar", data=titanic)

In [None]:
hec_fem[hec_fem.Hair == "Brown"].iloc[:,[1, 3]]

In [None]:
brown_haired_ladies = hec_fem[hec_fem.Hair == "Brown"].iloc[:,[1, 3]].set_index("Eye")
brown_haired_ladies

In [None]:
cs_res_3 = scipy.stats.chisquare(brown_haired_ladies)
print(cs_res_3)
print("The chi-squared test statistic is {}; \nThe p-value of the test is {}.".format(cs_res_3[0], cs_res_3[1]))

In [None]:
DATA = "~/PycharmProjects/Datasets/"
hec_path = DATA + "HairEyeColor.csv"
hec = pd.read_csv(hec_path, index_col=0)
brown_haired_ladies = hec.query("Sex == 'Female' and Hair == 'Brown'").iloc[:,[1, 3]].set_index("Eye")
chisquare_result = scipy.stats.chisquare(brown_haired_ladies)
print("The chi-squared test statistic is {}; \nThe p-value of the test is {}.".format(chisquare_result[0], chisquare_result[1]))

### New Dataset

In [None]:
diamonds = sns.load_dataset('diamonds')
diamonds.head()

In [None]:
diamonds = diamonds.sort_values(by=['color', 'cut'])

In [None]:
sns.histplot(data=diamonds, x="color", hue="cut", multiple="dodge", shrink=.8)

In [None]:
# import seaborn as sns

# diamonds = sns.load_dataset('diamonds')
# diamonds = diamonds.sort_values(by=['color', 'cut'])
# sns.histplot(data=diamonds, x="color", hue="cut", multiple="dodge", shrink=.7)

In [None]:
diamonds = sns.load_dataset('diamonds')
diamonds = pd.DataFrame(diamonds)
# d = diamonds.groupby(['cut', 'color']).agg({"color" : "count"}).unstack()
d = diamonds.groupby(['cut', 'color'])['cut'].count().unstack()
print(d.shape)
d

In [None]:
# НЕПРАВИЛЬНО!

# chisquare_result = scipy.stats.chisquare(d, axis=None)
# chisquare_result
# -------------------------------------------------------------
# ПРАВИЛЬНО!
chisquare_result = scipy.stats.chi2_contingency(d)
chisquare_result

In [None]:
# # Предполагается, что Вы уже скачали необходимый датасет и знаете путь до него
# import pandas as pd
# import scipy.stats
# import seaborn as sns

# diamonds = sns.load_dataset('diamonds')
# diamonds = pd.DataFrame(diamonds)
# d = diamonds.groupby(['cut', 'color'])['cut'].count().unstack()
# chisquare_result = scipy.stats.chi2_contingency(d)
# chisquare_result

In [None]:
diamonds["factor_price"] = np.where(diamonds['price'] >= diamonds.price.mean(), 1, 0)
diamonds["factor_carat"] = np.where(diamonds['carat'] >= diamonds.carat.mean(), 1, 0)

In [None]:
d = diamonds.groupby(['factor_carat', 'factor_price'])['factor_price'].count().unstack()
d

In [None]:
chisquare_result = scipy.stats.chi2_contingency(d)
chisquare_result[0]

In [None]:
# # Предполагается, что Вы уже скачали необходимый датасет и знаете путь до него
# import pandas as pd
# import numpy as np
# import scipy.stats
# import seaborn as sns

# diamonds = sns.load_dataset('diamonds')
# diamonds = pd.DataFrame(diamonds)
# diamonds["factor_price"] = np.where(diamonds['price'] >= diamonds.price.mean(), 1, 0)
# diamonds["factor_carat"] = np.where(diamonds['carat'] >= diamonds.carat.mean(), 1, 0)
# d = diamonds.groupby(['factor_carat', 'factor_price'])['factor_price'].count().unstack()
# chisquare_result = scipy.stats.chi2_contingency(d)
# chisquare_result[0]