# Practical Statistics for Data Scientists (Python)
# Chapter 1. Exploratory Data Analysis

Import required Python packages.

In [None]:
%matplotlib inline

from pathlib import Path

import pandas as pd
import numpy as np
import scipy.stats
import scipy.stats
from scipy.stats import trim_mean
from scipy import stats
from statsmodels import robust
import wquantiles

import seaborn as sns
import matplotlib.pylab as plt

print("Imports Done!")

Define paths to data sets. If you don't keep your data in the same directory as the code, adapt the path names.

In [None]:
DATA = "~/PycharmProjects/Datasets/"
mtcar_path = DATA + "mtcars.csv"
mtcar = pd.read_csv(mtcar_path)

In [None]:
mtcar.head()

In [None]:
mtcar.shape

In [None]:
mtcar.describe()

In [None]:
mtcar_df = mtcar.copy()

In [None]:
mtcar_df = mtcar_df.rename(columns={"Unnamed: 0" : "Car Model"})

In [None]:
mtcar_df["vs"] = mtcar_df["vs"].replace({0 : "V", 1 : "S"})

In [None]:
mtcar_df["am"] = mtcar_df["am"].replace({0 : "Auto", 1 : "Manual"})

In [None]:
# Mean of "qsec" over the rows where "cyl" is not 3 and "mpg" is above 20.
row_mask = (mtcar_df["cyl"] != 3) & (mtcar_df["mpg"] > 20)
result = mtcar_df.loc[row_mask, "qsec"].mean()
result

In [None]:
# Mean "hp" for every ("vs", "am") combination, returned as a one-column DataFrame.
mtcar_df.groupby(["vs", "am"])[["hp"]].mean()

In [None]:
# Median of every numeric column for each "am" group.
# numeric_only=True skips the string columns (e.g. the car model and the
# relabelled "vs"), which would otherwise raise TypeError on pandas >= 2.0.
mtcar_df.groupby("am").median(numeric_only=True)

In [None]:
# Median of every numeric column for each "am" group.
# numeric_only=True is required because the frame also holds string columns,
# for which a median cannot be computed (TypeError on pandas >= 2.0).
mtcar_df.groupby("am").median(numeric_only=True)

In [None]:
# Standard deviation of the numeric columns per ("am", "vs") group, keeping the
# 1st and 3rd resulting columns by position.
# numeric_only=True drops the string columns up front; without it pandas >= 2.0
# raises TypeError instead of silently skipping them.
mtcar_df.groupby(["am", "vs"]).std(numeric_only=True).iloc[:, [0, 2]]

In [None]:
# Per-("am", "vs") standard deviations of the 1st and 3rd numeric columns.
# numeric_only=True keeps the string columns out of the aggregation, which
# would otherwise fail with TypeError on pandas >= 2.0.
my_stats = mtcar_df.groupby(["am", "vs"]).std(numeric_only=True).iloc[:, [0, 2]]

In [None]:
# Standard deviation of "hp" and "disp" for each "am" group.
# Selecting the two columns before aggregating means the string columns are
# never passed to std(), so this works on every pandas version.
descriptions_stat = mtcar_df.groupby("am")[["hp", "disp"]].std()
descriptions_stat

In [None]:
# Summary statistics of "qsec" per ("am", "vs") group, rounded to 2 decimals.
qsec_summary_funcs = ["count", "min", "max", "mean", "std", "sem"]
mtcar_df.groupby(["am", "vs"]).agg({"qsec": qsec_summary_funcs}).round(2)

In [None]:
mtcar_df.isna().sum()

Dealing with NA 

In [None]:
# mtcar_df.head(15)

In [None]:
# mtcar_df["mpg"].mean()

In [None]:
# mtcar_df["mpg"].iloc[1:10] = None

In [None]:
# mtcar_df.head(15)

In [None]:
# mtcar_df["mpg"].mean()

In [None]:
sns.histplot(mtcar_df["mpg"])

In [None]:
ax = sns.boxplot(x="am", y="mpg", data=mtcar_df)

In [None]:
sns.scatterplot(data=mtcar_df, x="mpg", y="hp", hue="vs", size= "qsec")

In [None]:
sns.scatterplot(data=mtcar_df, x="mpg", y="disp", hue="hp")

In [None]:
sns.displot(data=mtcar_df, x="mpg", hue="am", kde=True)

In [None]:
mtcar_df.head()

In [None]:
# Table of row counts: "vs" values as rows, "am" values as columns.
vs_am_counts = mtcar_df.groupby(["vs", "am"]).size()
d = vs_am_counts.unstack()
d

In [None]:
# Fisher's exact test on the table `d`; the second element of the result is the p-value.
fisher_result = scipy.stats.fisher_exact(d)
p_value = fisher_result[1]

In [None]:
# It is assumed that you have already downloaded the required dataset and know the path to it
# import pandas as pd
# import scipy.stats

# mtcar_path = PATH_TO_DATA + "mtcars.csv"
# mtcar = pd.read_csv(mtcar_path)
# d = mtcar.groupby(["vs", "am"])["am"].count().unstack()
# p_value = scipy.stats.fisher_exact(d)[1]

### NEW DATASET 

In [None]:
DATA = "~/PycharmProjects/Datasets/"
airquality_path = DATA + "airquality.csv"
airquality = pd.read_csv(airquality_path)

In [None]:
airquality = airquality.drop("Unnamed: 0", axis=1)

In [None]:
airquality.head(10)

In [None]:
airquality_df = airquality.copy()

In [None]:
# Two equivalent ways to select the rows with Month >= 7.
# Only the last expression of a cell is displayed, so the .loc result below is
# computed but not shown.
airquality_df.loc[airquality_df.Month >= 7]
# OR
airquality_df.query("Month >= 7")

In [None]:
aq_ss = airquality_df.query("Month >= 7")

In [None]:
# Number of non-missing "Ozone" values per month in the filtered subset.
aq_ss.groupby("Month").agg({"Ozone" : "count"})
# OR
# airquality_df.query("Month >= 7").groupby("Month").agg({"Ozone" : "count"})

In [None]:
sns.boxplot(x="Month", y="Ozone", data=airquality_df)

### NEW DATASET

In [None]:
DATA = "~/PycharmProjects/Datasets/"
iris_path = DATA + "iris.csv"
iris = pd.read_csv(iris_path, index_col=0)

In [None]:
iris.head(10)

In [None]:
iris.describe()

In [None]:
# Standard deviation of each numeric column.
# numeric_only=True excludes the string "Species" column, which would
# otherwise make the aggregation raise TypeError on pandas >= 2.0.
iris.std(numeric_only=True)

In [None]:
# Median of each numeric column for the "virginica" rows, largest first.
# numeric_only=True is needed because the subset still contains the string
# "Species" column (TypeError on pandas >= 2.0 otherwise).
iris[iris.Species == "virginica"].median(numeric_only=True).sort_values(ascending=False)

In [None]:
sns.displot(data=iris, x="Sepal.Length", hue="Species", multiple="stack")

In [None]:
iris_df = iris[iris.Species != "setosa"]

In [None]:
sns.histplot(iris_df, x="Sepal.Length", kde=True, hue="Species")

In [None]:
g = sns.FacetGrid(iris_df, col="Species")
g.map(sns.histplot, "Sepal.Length", kde=True, bins=5)
g.add_legend()

https://seaborn.pydata.org/generated/seaborn.FacetGrid.html#seaborn.FacetGrid - IMBA

In [None]:
sns.histplot(iris_df, x="Sepal.Length", hue="Species", element="poly") 

In [None]:
sns.displot(iris_df, x="Sepal.Length", hue="Species", kind="kde")

In [None]:
sns.kdeplot(data=iris_df, x="Sepal.Length", hue="Species", fill=True, common_norm=False, alpha=.5, linewidth=1)

In [None]:
sns.set_theme(style="darkgrid")
sns.boxplot(y="Sepal.Length", x="Species", data=iris_df)

In [None]:
# Perform the Shapiro-Wilk test for normality.
shapiro_test = stats.shapiro(iris_df["Sepal.Length"])
shapiro_test

In [None]:
shapiro_test_1 = stats.shapiro(iris_df[iris_df.Species == "versicolor"]["Sepal.Length"])
shapiro_test_2 = stats.shapiro(iris_df[iris_df.Species == "virginica"]["Sepal.Length"])
[shapiro_test_1, shapiro_test_2]

In [None]:
# Perform Bartlett's test for equal variances.
# In short: check the homogeneity of variance between the two species.
stat, p = scipy.stats.bartlett(iris_df[iris_df.Species == "virginica"]["Sepal.Length"], 
                               iris_df[iris_df.Species == "versicolor"]["Sepal.Length"])
print("The test statistic is {}. \nThe p-value of the test is {}.".format(stat, p))

In [None]:
# Calculate the T-test for the means of two independent samples of scores.
t_test = scipy.stats.ttest_ind(iris_df[iris_df.Species == "virginica"]["Sepal.Length"], 
                               iris_df[iris_df.Species == "versicolor"]["Sepal.Length"]) #, equal_var=True
t_test

In [None]:
# Test the hypothesis that the mean sepal length in the population (the dataset) equals 8.
# A one-sample t-test is used for this.
t_test = scipy.stats.ttest_1samp(iris_df["Sepal.Length"], 8) 
t_test

Как и следовало ожидать - нет, среднее значение "Sepal.Length" не равно 8.

In [None]:
import numpy as np
import scipy.stats

# Hand-written helper (found online) that computes a confidence interval for the mean. Handy!
def mean_confidence_interval(data, confidence=0.95):
    """Return the sample mean and a t-based confidence interval around it.

    Parameters
    ----------
    data : array-like of numbers
        Sample values; converted to a float NumPy array.
    confidence : float, default 0.95
        Two-sided confidence level of the interval.

    Returns
    -------
    tuple
        ``(mean, lower_bound, upper_bound)``, where the bounds lie a half-width
        of ``sem * t.ppf((1 + confidence) / 2, n - 1)`` away from the mean.
    """
    sample = np.array(data) * 1.0
    center = np.mean(sample)
    std_err = scipy.stats.sem(sample)
    # Upper-tail probability for a two-sided interval, e.g. 0.975 for 95%.
    upper_tail_prob = (1 + confidence) / 2.
    half_width = std_err * scipy.stats.t.ppf(upper_tail_prob, len(sample) - 1)
    return center, center - half_width, center + half_width

In [None]:
mean_confidence_interval(iris_df["Sepal.Length"])

In [None]:
# Compute confidence intervals.
import numpy as np, scipy.stats as st

# "Sepal.Length" values of the two species kept in iris_df.
a = iris_df[iris_df.Species == "versicolor"]["Sepal.Length"]
b = iris_df[iris_df.Species == "virginica"]["Sepal.Length"]

# t-based interval at level 0.95 for each sample: n - 1 degrees of freedom,
# centred on the sample mean, scaled by the standard error of the mean.
print(st.t.interval(0.95, len(a)-1, loc=np.mean(a), scale=st.sem(a)),
      st.t.interval(0.95, len(b)-1, loc=np.mean(b), scale=st.sem(b)))
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.t.html?highlight=scipy%20stats%20t#scipy.stats.t

In [None]:
print(mean_confidence_interval(a),
      mean_confidence_interval(b))

Вывод - самописная функция повторяет метод .interval из библиотеки scipy.

Отлично!

In [None]:
print(a.quantile([0.05, 0.95]), "\n\n",
      b.quantile([0.05, 0.95]))
print("\n\n")
print(iris_df["Sepal.Length"].quantile([0.05, 0.95]))

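Итого — квантили 5 % и 95 % не совпадают с доверительными интервалами, и на первый взгляд метод `quantile` выглядит странно. На самом деле ошибки здесь нет: `quantile` описывает разброс самих наблюдений, а доверительный интервал — неопределённость оценки среднего, которая уменьшается с ростом объёма выборки (как $1/\sqrt{n}$). Поэтому для оценки доверительного интервала среднего `quantile` использовать не стоит.

Надо бы разобраться подробнее, почему разница именно такая.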

In [None]:
fig, ax = plt.subplots(figsize=(4, 4))

stats.probplot(iris_df["Sepal.Length"], plot=ax)

plt.tight_layout()
plt.show()

In [None]:
# Paired t-test
pt_test = scipy.stats.ttest_rel(iris_df["Sepal.Length"], iris_df["Sepal.Width"])
pt_test

### NEW DATASET

In [None]:
DATA = "~/PycharmProjects/Datasets/"
tooth_path = DATA + "ToothGrowth.csv"
toothgrowth = pd.read_csv(tooth_path, index_col=0)

In [None]:
toothgrowth.head(10)

In [None]:
toothgrowth.shape

In [None]:
toothgrowth.describe()

In [None]:
# Row masks for the two groups being compared: supplement "OJ" at dose 0.5
# and supplement "VC" at dose 2.0.
oj_mask = (toothgrowth["supp"] == "OJ") & (toothgrowth["dose"] == 0.5)
vc_mask = (toothgrowth["supp"] == "VC") & (toothgrowth["dose"] == 2.0)
subset_oj = toothgrowth[oj_mask]
subset_vc = toothgrowth[vc_mask]

In [None]:
# Independent two-sample t-test on the "len" column of the two subsets.
# Bracket access is used because "len" is also the name of a Python builtin.
oj_len = subset_oj["len"]
vc_len = subset_vc["len"]
t_test = scipy.stats.ttest_ind(oj_len, vc_len)
t_test.statistic

In [None]:
# # Предполагается, что Вы уже скачали необходимый датасет и знаете путь до него
# import pandas as pd
# import scipy.stats

# tooth_path = PATH_TO_DATA + "ToothGrowth.csv"
# toothgrowth = pd.read_csv(tooth_path, index_col=0)
# t_test = scipy.stats.ttest_ind(toothgrowth[(toothgrowth.supp == "OJ") & (toothgrowth.dose == 0.5)].len, 
#                                toothgrowth[(toothgrowth.supp == "VC") & (toothgrowth.dose == 2.0)].len) 
# print(t_test.statistic)

### NEW DATASET

In [None]:
DATA = "~/PycharmProjects/Datasets/"
lec_path = DATA + "lekarstva.csv"
lekarstva = pd.read_csv(lec_path, index_col=0)

In [None]:
print(lekarstva.shape)
lekarstva.head()

In [None]:
lekarstva.describe()

In [None]:
lec_pair_t_test = scipy.stats.ttest_rel(lekarstva.Pressure_after, lekarstva.Pressure_before) 
lec_pair_t_test.statistic

In [1]:
# It is assumed that you have already downloaded the required dataset and know the path to it
import pandas as pd
import scipy.stats

# Use DATA (set in the cell that first loads this dataset) rather than
# PATH_TO_DATA, which is only defined much further down the notebook and
# therefore raises NameError on "Restart & Run All".
lec_path = DATA + "lekarstva.csv"
lekarstva = pd.read_csv(lec_path, index_col=0)
# Paired t-test, before vs. after; with this argument order the statistic is
# positive when the "before" values are larger on average.
lec_pair_t_test = scipy.stats.ttest_rel(lekarstva.Pressure_before, lekarstva.Pressure_after)
print(lec_pair_t_test.statistic)

14.226730711863198


## NEW CHAPTER AND NEW DATASET

In [None]:
DATA = "~/PycharmProjects/Datasets/"
grants_path = DATA + "grants.csv"
grants = pd.read_csv(grants_path)

In [None]:
grants.head()

In [None]:
grants.shape

In [None]:
grants.isna().sum()

In [None]:
grants.describe()

In [None]:
grants_df = grants.copy()

In [None]:
grants_df["status"] = grants_df["status"].replace({0 : "Not funded", 1 : "Funded"})

In [None]:
grants_df["status"].unique()

In [None]:
grants_df.head(3)

In [None]:
table_1 = pd.pivot_table(grants_df, index="status", values="field", aggfunc="count")
table_1

In [None]:
table_2 = pd.pivot_table(grants_df, index=["field"], columns=["status"], aggfunc="count")
table_2

In [None]:
# Failed attempts
# -----------------------------
# grants_df.groupby(["field", "status"]).agg({"status" : "count"}).unstack()
# -----------------------------
# table_2 = grants_df.pivot_table(index="status", columns=["field"], aggfunc="count")
# table_2

In [None]:
# Table of row counts: "field" values as rows, "status" values as columns.
field_status_counts = grants_df.groupby(["field", "status"]).size()
dist = field_status_counts.unstack()
dist

In [None]:
# Exact two-sided binomial test: are the two "status" counts compatible with p = 0.5?
# scipy.stats.binom_test was deprecated and later removed from SciPy;
# scipy.stats.binomtest replaces it and takes scalar counts, not a table.
status_counts = table_1["field"]
binom_result = scipy.stats.binomtest(
    k=int(status_counts.iloc[0]),
    n=int(status_counts.sum()),
    p=0.5,
)
p_value_1 = binom_result.pvalue
p_value_1

In [None]:
# ChiSquare test
cs_res_1 = scipy.stats.chisquare(table_1)
print(cs_res_1)
print("The chi-squared test statistic is {}; \nThe p-value of the test is {}.".format(cs_res_1[0], cs_res_1[1]))

In [None]:
# Chi-square test of independence between "field" and "status".
# scipy.stats.chisquare on the transposed table ran a separate goodness-of-fit
# test for every field (each against equal frequencies) rather than one test
# of the two-way table; chi2_contingency derives the expected counts from the
# table margins. Missing combinations are treated as zero counts.
cs_res_2 = scipy.stats.chi2_contingency(dist.fillna(0))
cs_res_2

### New Dataset

In [None]:
DATA = "~/PycharmProjects/Datasets/"
hec_path = DATA + "HairEyeColor.csv"
hec = pd.read_csv(hec_path)

In [None]:
hec = hec.drop("Unnamed: 0", axis=1)

In [None]:
hec.head()

In [None]:
hec_fem = hec[hec.Sex == "Female"]
hec_fem

In [None]:
# Failed attempts
# sns.histplot(data=hec_fem, x="Hair", y="Freq")

In [None]:
sns.catplot(data=hec_fem, x="Hair", y="Freq", hue="Eye", kind="bar")

In [None]:
# How to do this well and properly? - See the tutorials.
# titanic = sns.load_dataset("titanic")
# sns.catplot(x="sex", y="survived", hue="class", kind="bar", data=titanic)

In [None]:
hec_fem[hec_fem.Hair == "Brown"].iloc[:,[1, 3]]

In [None]:
brown_haired_ladies = hec_fem[hec_fem.Hair == "Brown"].iloc[:,[1, 3]].set_index("Eye")
brown_haired_ladies

In [None]:
cs_res_3 = scipy.stats.chisquare(brown_haired_ladies)
print(cs_res_3)
print("The chi-squared test statistic is {}; \nThe p-value of the test is {}.".format(cs_res_3[0], cs_res_3[1]))

In [None]:
DATA = "~/PycharmProjects/Datasets/"
hec_path = DATA + "HairEyeColor.csv"
hec = pd.read_csv(hec_path, index_col=0)
brown_haired_ladies = hec.query("Sex == 'Female' and Hair == 'Brown'").iloc[:,[1, 3]].set_index("Eye")
chisquare_result = scipy.stats.chisquare(brown_haired_ladies)
print("The chi-squared test statistic is {}; \nThe p-value of the test is {}.".format(chisquare_result[0], chisquare_result[1]))

### New Dataset

In [None]:
diamonds = sns.load_dataset('diamonds')
diamonds.head()

In [None]:
diamonds = diamonds.sort_values(by=['color', 'cut'])

In [None]:
sns.histplot(data=diamonds, x="color", hue="cut", multiple="dodge", shrink=.8)

In [None]:
# import seaborn as sns

# diamonds = sns.load_dataset('diamonds')
# diamonds = diamonds.sort_values(by=['color', 'cut'])
# sns.histplot(data=diamonds, x="color", hue="cut", multiple="dodge", shrink=.7)

In [None]:
# Reload the dataset; sns.load_dataset already returns a DataFrame, so no
# extra pd.DataFrame(...) wrapping is needed.
diamonds = sns.load_dataset('diamonds')
# d = diamonds.groupby(['cut', 'color']).agg({"color" : "count"}).unstack()
# Table of counts: "cut" as rows, "color" as columns.
# observed=False is passed explicitly so that, if the keys are categorical,
# every category combination is kept and recent pandas versions do not emit a
# FutureWarning about the changing default.
d = diamonds.groupby(['cut', 'color'], observed=False)['cut'].count().unstack()
print(d.shape)
d

In [None]:
# WRONG!

# chisquare_result = scipy.stats.chisquare(d, axis=None)
# chisquare_result
# -------------------------------------------------------------
# CORRECT!
chisquare_result = scipy.stats.chi2_contingency(d)
chisquare_result

In [None]:
# # Предполагается, что Вы уже скачали необходимый датасет и знаете путь до него
# import pandas as pd
# import scipy.stats
# import seaborn as sns

# diamonds = sns.load_dataset('diamonds')
# diamonds = pd.DataFrame(diamonds)
# d = diamonds.groupby(['cut', 'color'])['cut'].count().unstack()
# chisquare_result = scipy.stats.chi2_contingency(d)
# chisquare_result

In [None]:
# table_diamonds = pd.pivot_table(diamonds, index=["cut"], columns=["color"], aggfunc="count")
# table_diamonds

In [None]:
# Binary indicators: 1 when the value is at or above the column mean, else 0.
price_at_or_above_mean = diamonds["price"] >= diamonds["price"].mean()
carat_at_or_above_mean = diamonds["carat"] >= diamonds["carat"].mean()
diamonds["factor_price"] = np.where(price_at_or_above_mean, 1, 0)
diamonds["factor_carat"] = np.where(carat_at_or_above_mean, 1, 0)

In [None]:
# 2x2 table of row counts: "factor_carat" as rows, "factor_price" as columns.
factor_counts = diamonds.groupby(['factor_carat', 'factor_price']).size()
d = factor_counts.unstack()
d

In [None]:
chisquare_result = scipy.stats.chi2_contingency(d)
chisquare_result[0]

In [None]:
# # Предполагается, что Вы уже скачали необходимый датасет и знаете путь до него
# import pandas as pd
# import numpy as np
# import scipy.stats
# import seaborn as sns

# diamonds = sns.load_dataset('diamonds')
# diamonds = pd.DataFrame(diamonds)
# diamonds["factor_price"] = np.where(diamonds['price'] >= diamonds.price.mean(), 1, 0)
# diamonds["factor_carat"] = np.where(diamonds['carat'] >= diamonds.carat.mean(), 1, 0)
# d = diamonds.groupby(['factor_carat', 'factor_price'])['factor_price'].count().unstack()
# chisquare_result = scipy.stats.chi2_contingency(d)
# chisquare_result[0]

### Game: Megafon and N+1
#### https://nplus1.ru/material/2020/10/27/megafon

In [None]:
import re

In [None]:
PATH_TO_DATA = "~/Downloads/Megafon_and_Nplus1_Game/"
game_df_1 = pd.read_csv(PATH_TO_DATA + "first.csv", sep=";", header=None)
game_df_1.head(10)

In [None]:
game_df_1.columns = ["transaction_id", "company_name", "transaction_amount", "comment"]
game_df_1

In [None]:
type(game_df_1.iloc[:,1])

In [None]:
# Keep the rows whose second column contains a UUID-like token:
# 8-4-4-4-12 runs of lowercase letters/digits separated by hyphens.
# Fixes over the previous pattern:
#   * "|" inside [...] is a literal pipe, not alternation, so it is removed;
#   * groups are non-capturing, which avoids the pandas "match groups" warning;
#   * na=False treats missing values as non-matches so the mask stays boolean.
uuid_like_pattern = r'[a-z0-9]{8}(?:-[a-z0-9]{4}){3}-[a-z0-9]{12}'
game_df_1[game_df_1.iloc[:, 1].str.contains(uuid_like_pattern, regex=True, na=False)]

In [None]:
# PATH_TO_DATA = "~/Downloads/Megafon_and_Nplus1_Game/"

# Load the second semicolon-separated file of the game.
# The `verbose` flag was dropped: it is deprecated in recent pandas releases
# and only printed parser diagnostics, without affecting the loaded data.
game_df_2 = pd.read_csv(PATH_TO_DATA + "dva.csv", sep=";")
game_df_2.head(10)