In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load 'amcat.csv' (resolved relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='amcat.csv')
# Show the first five rows as the cell output.
data.head(n=5)

In [None]:
# Summary statistics of the dataset; by default `describe()` on a mixed-dtype
# frame reports only the numeric columns.
data.describe()

Univariate Analysis

1. Histograms

In [None]:
# Select the numerical columns used by every plotting cell below.
# `select_dtypes` takes (include, exclude) positionally, so the dtypes must be
# passed together as a list to `include`; passing them as two separate
# positional arguments would *exclude* the int64 columns instead of keeping them.
cols = data.select_dtypes(include=['float64', 'int64']).columns
cols

In [None]:
# Plot the distribution of each numerical column, then print its summary statistics.
for col in cols:
    series = data[col]

    # Histogram with a kernel-density overlay on its own figure.
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.histplot(series, kde=True, ax=ax)
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

    # Observations
    mean, median, std = series.mean(), series.median(), series.std()
    print(f"Observations for {col}:")
    print(f"Mean: {mean}, Median: {median}, Std: {std}")
    print("Skewness:", series.skew())
    print("Kurtosis:", series.kurtosis())
    print("\n")

2. Boxplots

In [None]:
# Draw a boxplot for each numerical column and list the rows flagged by the IQR rule.
for col in cols:
    values = data[col]

    fig, ax = plt.subplots(figsize=(10, 4))
    sns.boxplot(x=values, ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)
    plt.show()

    # Identify Outliers: values more than 1.5 * IQR below Q1 or above Q3.
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    is_outlier = values.lt(q1 - whisker) | values.gt(q3 + whisker)
    print(f"Outliers in {col}:")
    print(data.loc[is_outlier])
    print("\n")

3. Categorical Columns Analysis

In [None]:

# Object-dtype columns are treated as the categorical variables.
categorical_cols = data.select_dtypes(include=['object']).columns
print(categorical_cols)

# Countplots
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.countplot(data=data, x=col, ax=ax)
    ax.set_title(f'Countplot of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    # Tilt the category labels so long names do not overlap.
    plt.xticks(rotation=45)
    plt.show()

    # Frequency Distribution
    print(f"Frequency distribution for {col}:")
    print(data[col].value_counts())
    print("\n")

Bivariate Analysis

1. Scatter Plots

In [None]:

# Scatter plot and correlation for every unordered pair of numerical columns.
for i, x_col in enumerate(cols):
    # Only pair x_col with the columns after it, so each pair is drawn once.
    for y_col in cols[i + 1:]:
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.scatterplot(data=data, x=x_col, y=y_col, ax=ax)
        ax.set_title(f'Scatter Plot: {x_col} vs {y_col}')
        ax.set_xlabel(x_col)
        ax.set_ylabel(y_col)
        plt.show()

        # Observations
        correlation = data[x_col].corr(data[y_col])
        print(f"Correlation between {x_col} and {y_col}: {correlation}\n")


In [None]:
# Hexbin density plot for every unordered pair of numerical columns.
for i, x_col in enumerate(cols):
    # Only pair x_col with the columns after it, so each pair is drawn once.
    for y_col in cols[i + 1:]:
        fig, ax = plt.subplots(figsize=(10, 6))
        hexes = ax.hexbin(data[x_col], data[y_col], gridsize=30, cmap='Blues')
        ax.set_title(f'Hexbin Plot: {x_col} vs {y_col}')
        ax.set_xlabel(x_col)
        ax.set_ylabel(y_col)
        # The colour scale shows how many points fall in each hexagon.
        fig.colorbar(hexes, ax=ax, label='Counts')
        plt.show()

In [None]:
# c. Pair Plots
# `pairplot` returns a PairGrid that owns a whole grid of subplots. `plt.title`
# would label only whichever single subplot is current, so the heading is set
# on the grid's figure instead.
pair_grid = sns.pairplot(data[cols])
# y slightly above 1 keeps the heading clear of the top row of subplots.
pair_grid.figure.suptitle('Pair Plot of Numerical Columns', y=1.02)
plt.show()

In [None]:
# 2. Patterns Between Categorical and Numerical Columns
# Object-dtype columns are treated as the categorical variables.
categorical_cols = data.select_dtypes(include=['object']).columns

# a. Swarmplots
# One figure per (categorical, numerical) pair.
for cat in categorical_cols:
    groups = data[cat]
    for num in cols:
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.swarmplot(x=groups, y=data[num], ax=ax)
        ax.set_title(f'Swarm Plot: {cat} vs {num}')
        ax.set_xlabel(cat)
        ax.set_ylabel(num)
        # Tilt the category labels so long names do not overlap.
        plt.xticks(rotation=45)
        plt.show()

In [None]:
# b. Boxplots
# Distribution of each numerical column within each category, one figure per pair.
for cat in categorical_cols:
    groups = data[cat]
    for num in cols:
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(x=groups, y=data[num], ax=ax)
        ax.set_title(f'Boxplot: {cat} vs {num}')
        ax.set_xlabel(cat)
        ax.set_ylabel(num)
        # Tilt the category labels so long names do not overlap.
        plt.xticks(rotation=45)
        plt.show()

In [None]:
# c. Barplots
# Mean of each numerical column per category, one figure per pair.
for cat in categorical_cols:
    groups = data[cat]
    for num in cols:
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(x=groups, y=data[num], estimator='mean', ax=ax)
        ax.set_title(f'Bar Plot: {cat} vs {num}')
        ax.set_xlabel(cat)
        ax.set_ylabel(num)
        # Tilt the category labels so long names do not overlap.
        plt.xticks(rotation=45)
        plt.show()

In [None]:
# 3. Relationships Between Categorical Columns
# For every unordered pair of categorical columns, draw a stacked bar chart of
# their contingency table and print the table itself.
for i in range(len(categorical_cols)):
    for j in range(i + 1, len(categorical_cols)):
        row_col, stack_col = categorical_cols[i], categorical_cols[j]

        # Rows are the categories of row_col; each stack segment is a category of stack_col.
        cross_tab = pd.crosstab(data[row_col], data[stack_col])
        cross_tab.plot(kind='bar', stacked=True, figsize=(10, 6))
        plt.title(f'Stacked Bar Plot: {row_col} vs {stack_col}')
        plt.xlabel(row_col)
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.legend(title=stack_col)
        plt.show()

        # Observations
        print(f"Relationship between {row_col} and {stack_col}:\n{cross_tab}\n")