# In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import numpy as np
# Input workbook and the directory every figure below is written to.
# Both are absolute paths on the author's machine.
file_path = "C:/Users/asus/Desktop/question_one_data.xlsx"
save_path = "C:/Users/asus/Desktop/plots/"

# Read the first worksheet into a DataFrame.
df = pd.read_excel(io=file_path, sheet_name="Sheet1")

# Use seaborn's white-grid look for all subsequent plots.
sns.set(style="whitegrid")

# Create the output directory; an already existing directory is not an error.
os.makedirs(name=save_path, exist_ok=True)

# Display basic information about the dataset: shape, first rows, column
# dtypes, missing values, duplicate rows and summary statistics.
print("Shape of data:")
print(df.shape)

# The preview must be printed explicitly: a bare `df.head()` expression in the
# middle of a script/cell is evaluated and then thrown away.
print("\nFirst rows:")
print(df.head())

print("\nDataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()


print("\nMissing Values:")
print(df.isnull().sum())


print("\nNumber of duplicate rows:")
print(df.duplicated().sum())


print("\nSummary Statistics:")
print(df.describe())

# Replace the spreadsheet headers with identifier-friendly column names.
# "Elongation" maps to itself, so that column is left unchanged.
column_aliases = {
    "Age (Ma)": "Age_Ma",
    "Max (Diameter) (µm)": "Max_Diameter",
    "Min (Diameter) (µm)": "Min_Diameter",
    "Elongation": "Elongation",
}
df.rename(columns=column_aliases, inplace=True)

# Work on a copy ordered by increasing age, with a fresh 0..n-1 index.
age_ordered = df.sort_values(by="Age_Ma", ascending=True)
df_sorted = age_ordered.reset_index(drop=True)

# Split the ages into five equal-count (quantile) bins. The labels are fixed
# strings; they are not derived from the bin edges computed here.
age_group_labels = [
    "0.01-0.62MA",
    "0.63-1.31MA",
    "1.31-1.93MA",
    "1.93-2.76MA",
    "2.77-5.00MA",
]
df_sorted["Age_Group"] = pd.qcut(df_sorted["Age_Ma"], q=5, labels=age_group_labels)

# Distribution of numerical variables: one histogram per numeric column of df.
selected_cols = df.select_dtypes(include=[np.number]).columns.tolist()


# Global matplotlib text settings; these stay in effect for every later plot.
# NOTE(review): SimHei is a CJK font and all labels below are ASCII -- confirm
# it is still needed (matplotlib warns and falls back when it is not installed).
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Histogram + KDE (data distribution)
for col in selected_cols:
    fig = plt.figure(figsize=(8, 5))
    sns.histplot(df[col], kde=True, bins=30)
    plt.title(f"histogram of the distribution of {col}")
    plt.xlabel(col)
    plt.ylabel("num")
    # Save before show(): some backends discard the figure once it is shown.
    plt.savefig(f"{save_path}histogram_{col}.png", dpi=300)
    plt.show()
    # Release the figure explicitly; otherwise every iteration leaves an open
    # figure behind on backends where show() does not close it.
    plt.close(fig)


# Compute min, max, and interval (max - min) of Age_Ma for each Age_Group.
# Age_Group is a categorical column (built with pd.qcut), so `observed` is
# passed explicitly: relying on the implicit default for categorical keys is
# deprecated in recent pandas. observed=False keeps one row per label, which
# is what the implicit default produced.
age_label = df_sorted.groupby("Age_Group", observed=False)["Age_Ma"].agg(["min", "max"])
age_label["Interval"] = age_label["max"] - age_label["min"]

print("Labeled Age Group Time Intervals (Unit: Million Years, Ma):\n")
for group, row in age_label.iterrows():
    print(f"{group}: {row['min']:.6f} Ma → {row['max']:.6f} Ma (Interval: {row['Interval']:.6f} Ma)")


# Measurement columns analysed below, and the human-readable title used for
# each one in plot headings and output file names (underscores become spaces).
features = ["Max_Diameter", "Min_Diameter", "Elongation"]
titles = [name.replace("_", " ") for name in features]

# Columns checked for outliers: the same three, held in a separate list.
selected_cols = list(features)

# count outliers for "Max_Diameter", "Min_Diameter", "Elongation"
def detect_outliers_iqr(df, cols, factor=1.5):
    """Flag values lying outside the IQR fences of each requested column.

    For every column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``, where Q1/Q3 are the 25th/75th percentiles and
    ``IQR = Q3 - Q1``. A value is an outlier when it is strictly below the
    lower fence or strictly above the upper fence.

    Args:
        df: DataFrame holding the data.
        cols: Iterable of column names to examine.
        factor: Multiplier applied to the IQR when building the fences.

    Returns:
        A dict keyed by column name. Each value is a dict with the keys
        ``"lower_bound"``, ``"upper_bound"``, ``"count"`` (number of flagged
        rows) and ``"outliers"`` (the rows of ``df`` flagged for that column).
    """

    def _fences(values):
        # Both quartiles in a single quantile call.
        first_quartile, third_quartile = values.quantile([0.25, 0.75])
        spread = third_quartile - first_quartile
        return first_quartile - factor * spread, third_quartile + factor * spread

    report = {}
    for name in cols:
        values = df[name]
        low, high = _fences(values)
        # Comparisons against NaN are False, so missing values are never flagged.
        flagged = (values < low) | (values > high)
        report[name] = dict(
            lower_bound=low,
            upper_bound=high,
            count=flagged.sum(),
            outliers=df[flagged],
        )
    return report

# Run the IQR outlier check on the selected columns of the unsorted frame and
# print, per column, the fences, the outlier count and the first flagged rows.
iqr_outliers = detect_outliers_iqr(df, selected_cols, factor=1.5)

for column_name, summary in iqr_outliers.items():
    report_lines = (
        f"Column {column_name}:",
        f"Lower bound: {summary['lower_bound']:.2f}, Upper bound: {summary['upper_bound']:.2f}",
        f"Number of outliers: {summary['count']}",
        f"Example outliers:\n{summary['outliers'].head()}\n",
    )
    # One entry per output line, joined by newlines.
    print(*report_lines, sep="\n")

# Box plot of each measurement per age group, saved as BoxPlot_<title>.png.
for feature, title in zip(features, titles):
    fig = plt.figure(figsize=(8, 6))
    # `order` fixes the left-to-right sequence of the boxes to the listed labels.
    sns.boxplot(data=df_sorted, x="Age_Group", y=feature, order=["0.01-0.62MA", "0.63-1.31MA", "1.31-1.93MA", "1.93-2.76MA", "2.77-5.00MA"])
    plt.title(f"Box Plot: {title} Across Age Groups (Sorted by Time)")
    plt.xlabel("Age Group (Ordered by Time)")
    plt.ylabel(title)
    # Save before show(): some backends discard the figure once it is shown.
    plt.savefig(f"{save_path}BoxPlot_{title}.png", dpi=300)
    plt.show()
    # Release the figure explicitly; otherwise every iteration leaves an open
    # figure behind on backends where show() does not close it.
    plt.close(fig)


# Scatter plot of each measurement against age, saved as ScatterPlot_<title>.png.
for feature, title in zip(features, titles):
    fig = plt.figure(figsize=(8, 6))
    # alpha < 1 keeps overlapping points distinguishable.
    sns.scatterplot(data=df_sorted, x="Age_Ma", y=feature, alpha=0.6)
    plt.title(f"Scatter Plot: Age vs {title}")
    plt.xlabel("Age (Ma)")
    plt.ylabel(title)
    # Save before show(): some backends discard the figure once it is shown.
    plt.savefig(f"{save_path}ScatterPlot_{title}.png", dpi=300)
    plt.show()
    # Release the figure explicitly; otherwise every iteration leaves an open
    # figure behind on backends where show() does not close it.
    plt.close(fig)

# Line plot of each measurement against age, saved as LinePlot_<title>.png.
for feature, title in zip(features, titles):
    fig = plt.figure(figsize=(8, 6))
    sns.lineplot(data=df_sorted, x="Age_Ma", y=feature, marker="o")
    plt.title(f"Line Plot: Age vs {title}")
    plt.xlabel("Age (Ma)")
    plt.ylabel(title)
    # Flip the x-axis so larger Age_Ma values are drawn on the left.
    # NOTE(review): presumably Age_Ma is "million years before present", which
    # would make the plot read oldest-to-youngest from left to right -- confirm.
    plt.gca().invert_xaxis()
    # Save before show(): some backends discard the figure once it is shown.
    plt.savefig(f"{save_path}LinePlot_{title}.png", dpi=300)
    plt.show()
    # Release the figure explicitly; otherwise every iteration leaves an open
    # figure behind on backends where show() does not close it.
    plt.close(fig)


# Pairwise correlation between age and the three shape measurements, printed
# and drawn as an annotated heatmap.
# NOTE(review): if Age_Ma counts million years before present, a negative
# correlation with Max_Diameter means older specimens are smaller, i.e. size
# increased toward the present. The previous comment here stated the opposite
# reading -- confirm which direction is intended.
cols = ["Age_Ma", "Max_Diameter", "Min_Diameter", "Elongation"]

# Coerce the analysed columns to numeric; entries that cannot be parsed become NaN.
numeric_columns = df_sorted[cols].apply(pd.to_numeric, errors="coerce")
df_sorted[cols] = numeric_columns

correlation_matrix = df_sorted[cols].corr()
# Plain ndarray copy of the matrix (not used again in this part of the file).
correlation_matrix1 = np.array(correlation_matrix)
print(correlation_matrix)
print(correlation_matrix.shape)

heatmap_options = dict(
    square=True,
    annot=True,
    fmt=".2f",
    annot_kws={"color": "black"},
    cmap="RdYlBu",
    linewidths=0.5,
)
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Correlation Heatmap")
plt.savefig(save_path + "Correlation Heatmap.png", dpi=300)
plt.show()







