# Exploratory Data Analysis

This notebook performs exploratory data analysis (EDA) on the dataset. The goal of EDA is to understand the data distributions, visualize relationships between variables, and identify patterns or anomalies in the data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw CSV into a DataFrame and preview its first rows.
# The path is relative, so it resolves against the kernel's working directory.
raw_csv_path = '../data/raw/filtered_file.csv'
data = pd.read_csv(raw_csv_path)
data.head()

In [2]:
# Count null entries per column, then show only the columns that have any.
missing_values = data.isna().sum()
missing_values.loc[missing_values.gt(0)]

In [3]:
# Bar chart of how many rows fall under each distinct value of 'score'.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='score', data=data, ax=ax)
ax.set(
    title='Distribution of Target Variable (Score)',
    xlabel='Score',
    ylabel='Count',
)
plt.show()

In [4]:
# Visualize pairwise correlations between features as an annotated heatmap.
plt.figure(figsize=(12, 10))
# Correlation is only defined for numeric data. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# them instead, so restrict the frame to numeric/boolean columns explicitly.
numeric_data = data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_data.corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Correlation Matrix')
plt.show()

In [5]:
# Pairplot to visualize pairwise relationships between features, colored by 'score'.
pair_grid = sns.pairplot(data, hue='score')
# pairplot returns a grid of subplots; plt.title() would only label whichever
# single subplot is current, so set a figure-level title instead. y > 1 lifts
# the title above the top row of subplots so it does not overlap them.
pair_grid.figure.suptitle('Pairplot of Features', y=1.02)
plt.show()

In [6]:
# Descriptive statistics for the dataset, as produced by DataFrame.describe().
summary_stats = data.describe()
summary_stats