In [19]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
url = "https://raw.githubusercontent.com/salemprakash/EDA/refs/heads/main/Data/comics.csv"

# Numeric measures that are cleaned and plotted, plus the column used as hue.
NUMERIC_COLUMNS = ['page_views', 'price']
REQUIRED_COLUMNS = NUMERIC_COLUMNS + ['gender']


def missing_columns(frame, required):
    """Return the names in *required* that are not columns of *frame*.

    The order of *required* is preserved so the report is deterministic.
    """
    return [name for name in required if name not in frame.columns]


def clean_numeric_column(series):
    """Return *series* converted to a numeric dtype.

    Thousands separators (commas) are removed before conversion, and values
    that still cannot be parsed become NaN. A series that is already numeric
    is returned unchanged, because the ``.str`` accessor raises
    ``AttributeError`` on non-string data.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series
    # astype(str) keeps non-string cells (e.g. ints in an object column)
    # instead of letting ``.str`` turn them into NaN; missing values become
    # text that to_numeric coerces back to NaN.
    without_commas = series.astype(str).str.replace(',', '', regex=False)
    return pd.to_numeric(without_commas, errors='coerce')


def plot_page_views_analysis(frame):
    """Draw the univariate, bivariate, pairplot and heatmap figures.

    *frame* must contain numeric 'page_views' and 'price' columns and a
    'gender' column. Returns the page_views/price correlation matrix that is
    shown in the heatmap.
    """
    # Univariate Analysis
    plt.figure(figsize=(10, 6))
    sns.histplot(frame['page_views'], bins=30, kde=True)
    plt.title('Distribution of Page Views')
    plt.xlabel('Page Views')
    plt.ylabel('Frequency')
    plt.show()

    # Bivariate Analysis
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='page_views', y='price', data=frame, hue='gender', palette='coolwarm')
    plt.title('Scatter Plot of Page Views vs Price')
    plt.xlabel('Page Views')
    plt.ylabel('Price')
    plt.legend(title='Gender')
    plt.show()

    # Multivariate Analysis using Pairplot
    sns.pairplot(frame[['page_views', 'price', 'gender']], hue='gender', palette='coolwarm')
    plt.suptitle('Pairplot of Page Views, Price, and Gender', y=1.02)
    plt.show()

    # Correlation Heatmap
    correlation = frame[['page_views', 'price']].corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap between Page Views and Price')
    plt.show()
    return correlation


if __name__ == "__main__":
    df = pd.read_csv(url)

    # Display the first few rows to check the structure
    print("First few rows of the DataFrame:")
    print(df.head())

    # Check data types and for missing values.
    # info() prints its report itself and returns None, so it is not wrapped
    # in print() (that would emit a stray "None").
    print("\nData types and missing values:")
    df.info()

    # Strip any whitespace from column names
    df.columns = df.columns.str.strip()

    # Every column the analysis touches must exist, not just 'page_views'.
    absent = missing_columns(df, REQUIRED_COLUMNS)
    if absent:
        for name in absent:
            print(f"'{name}' column not found in the DataFrame.")
    else:
        for name in NUMERIC_COLUMNS:
            df[name] = clean_numeric_column(df[name])

        # Check the conversion results
        print("\n'page_views' and 'price' after conversion:")
        print(df[NUMERIC_COLUMNS].head())

        # Drop rows with NaN values in the relevant columns
        df = df.dropna(subset=REQUIRED_COLUMNS)

        if df.empty:
            # Nothing to plot; the histogram/KDE and pairplot need data.
            print("No rows left after dropping missing values; skipping plots.")
        else:
            correlation_matrix = plot_page_views_analysis(df)


First few rows of the DataFrame:
                                  name       id    align         eye  \
0            Spider-Man (Peter Parker)   Secret     Good  Hazel Eyes   
1      Captain America (Steven Rogers)   Public     Good   Blue Eyes   
2  Wolverine (James \"Logan\" Howlett)   Public  Neutral   Blue Eyes   
3    Iron Man (Anthony \"Tony\" Stark)   Public     Good   Blue Eyes   
4                  Thor (Thor Odinson)  No Dual     Good   Blue Eyes   

         hair gender  gsm              alive  appearances first_appear  \
0  Brown Hair   Male  NaN  Living Characters       4043.0       Aug-62   
1  White Hair   Male  NaN  Living Characters       3360.0       Mar-41   
2  Black Hair   Male  NaN  Living Characters       3061.0       Oct-74   
3  Black Hair   Male  NaN  Living Characters       2961.0       Mar-63   
4  Blond Hair   Male  NaN  Living Characters       2258.0       Nov-50   

  publisher  
0    marvel  
1    marvel  
2    marvel  
3    marvel  
4    marvel  

Data