<a href="https://colab.research.google.com/github/Sri-iim/CAAI/blob/main/EDACCAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from docx import Document
from docx.shared import Inches
from scipy.stats import zscore

# Load Data
# The workbook location can be overridden through the EDA_DATA_PATH environment
# variable so the notebook is not tied to one machine's directory layout; the
# original absolute path is kept as the default.
DATA_PATH = os.environ.get(
    "EDA_DATA_PATH",
    'C:\\Users\\CHUMKI\\OneDrive\\Documents\\data.xlsx',
)
df_raw = pd.read_excel(DATA_PATH, sheet_name="Data")
df_raw['Date'] = pd.to_datetime(df_raw['Date'])

# Derive the analysis frame from the unfiltered one instead of dropping rows in
# place, so the rows removed here stay inspectable in df_raw. The explicit
# copy makes df an independent frame for the later plotting cells.
df = df_raw.dropna().copy()

In [None]:
# Seaborn theme applied before any figure is drawn
sns.set(palette="deep", style="whitegrid")

# Histogram of FeeEuros (20 bins) with a KDE curve, saved as image 1
fig, ax = plt.subplots()
sns.histplot(df['FeeEuros'], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of FeeEuros')
fig.tight_layout()
fig.savefig("1_distribution_feeeuros.png")
plt.close(fig)

In [None]:
# Single horizontal boxplot of FeeEuros, saved as image 2
fig, ax = plt.subplots()
sns.boxplot(x=df['FeeEuros'], ax=ax)
ax.set_title('Boxplot of FeeEuros')
fig.tight_layout()
fig.savefig("2_boxplot_feeeuros.png")
plt.close(fig)

In [None]:
# Pairwise correlations between the numeric columns of df, annotated to 2 decimals
numeric_frame = df.select_dtypes(include=[np.number])
corr = numeric_frame.corr()

fig, ax = plt.subplots()
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
fig.tight_layout()
fig.savefig("3_correlation_heatmap.png")
plt.close(fig)

In [None]:
# Scatter-matrix of FeeEuros against three other selected columns
pair_columns = ['FeeEuros', 'YearsOfSentence', 'TrialDurationInWeeks', 'Age']
pair_data = df[pair_columns]

# pairplot creates its own figure; the pyplot calls below act on that figure
sns.pairplot(pair_data)
plt.savefig("4_pairplot.png")
plt.close()

In [None]:
# One FeeEuros box per law firm, saved as image 5
fig, ax = plt.subplots()
sns.boxplot(data=df, x='LawFirm', y='FeeEuros', ax=ax)
ax.set_title('FeeEuros by Law Firm')
fig.tight_layout()
fig.savefig("5_fee_by_lawfirm.png")
plt.close(fig)

In [None]:
# FeeEuros against trial duration, points coloured by law firm, saved as image 6
fig, ax = plt.subplots()
sns.scatterplot(data=df, x='TrialDurationInWeeks', y='FeeEuros', hue='LawFirm', ax=ax)
ax.set_title('FeeEuros vs Trial DurationInWeeks by LawFirm')
fig.tight_layout()
fig.savefig("6_fee_vs_trial_duration.png")
plt.close(fig)

In [None]:
# One FeeEuros box per LawKind value, saved as image 7
fig, ax = plt.subplots()
sns.boxplot(data=df, x='LawKind', y='FeeEuros', ax=ax)
ax.set_title('FeeEuros by LawKind')
fig.tight_layout()
fig.savefig("7_fee_by_lawkind.png")
plt.close(fig)

In [None]:
# Row counts per LawKind, split into one bar per Gender, saved as image 8
fig, ax = plt.subplots()
sns.countplot(data=df, x='LawKind', hue='Gender', ax=ax)
ax.set_title('Gender Representation by LawKind')
fig.tight_layout()
fig.savefig("8_gender_by_lawkind.png")
plt.close(fig)

In [None]:
# Z-Score for Outliers
# The standardised fees live in their own Series rather than in a new 'z_fee'
# column on df: a derived numeric column on the shared frame would be picked
# up by any later df.select_dtypes(np.number) call and show up as a spurious
# variable in correlation analyses. The Series keeps the name 'z_fee' so the
# x-axis label of the plot is unchanged.
fee_z = pd.Series(np.asarray(zscore(df['FeeEuros'])), index=df.index, name='z_fee')

fig, ax = plt.subplots()
sns.histplot(fee_z, bins=30, kde=True, ax=ax)
ax.set_title('Z-Score of FeeEuros (Outlier Detection)')
fig.tight_layout()
fig.savefig("9_zscore_feeeuros.png")
plt.close(fig)

In [None]:
# Monthly Average Fee
# Group on a month key derived from Date instead of adding a 'Month' column to
# df, so the shared frame keeps the columns it was loaded with. Naming the key
# 'Month' makes reset_index() produce the same 'Month' column as before.
month_key = df['Date'].dt.to_period('M').rename('Month')
monthly_avg = df.groupby(month_key)['FeeEuros'].mean().reset_index()

# Period values are turned into strings (e.g. '2020-01') to serve as x labels
monthly_avg['Month'] = monthly_avg['Month'].astype(str)

fig, ax = plt.subplots()
sns.lineplot(x='Month', y='FeeEuros', data=monthly_avg, marker='o', ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Monthly Average FeeEuros Over Time')
fig.tight_layout()
fig.savefig("10_monthly_feeeuros.png")
plt.close(fig)

In [None]:
# Ten lawyers with the highest mean FeeEuros, largest first, as horizontal bars
mean_fee_by_lawyer = df.groupby('Lawyer')['FeeEuros'].mean()
top_lawyers = mean_fee_by_lawyer.sort_values(ascending=False).head(10)

fig, ax = plt.subplots()
sns.barplot(x=top_lawyers.values, y=top_lawyers.index, ax=ax)
ax.set_title('Top 10 Lawyers by Average FeeEuros')
fig.tight_layout()
fig.savefig("11_top_lawyers.png")
plt.close(fig)

In [None]:
# FeeEuros density per Gender value as violins, saved as image 12
fig, ax = plt.subplots()
sns.violinplot(data=df, x='Gender', y='FeeEuros', ax=ax)
ax.set_title('FeeEuros Distribution by Gender')
fig.tight_layout()
fig.savefig("12_violin_gender_feeeuros.png")
plt.close(fig)

In [None]:
# Cell-by-cell null mask of df drawn as a heatmap, saved as image 13.
# NOTE(review): df has already been through dropna() when it was loaded, so
# this mask is expected to be all False for the loaded columns and the image
# uniformly coloured; it only acts as a sanity check in its current position.
null_mask = df.isnull()

fig, ax = plt.subplots()
sns.heatmap(null_mask, cmap='viridis', cbar=False, ax=ax)
ax.set_title('Missing Values Heatmap')
fig.tight_layout()
fig.savefig("13_missing_values_heatmap.png")
plt.close(fig)

In [None]:
# Longitude/Latitude scatter where both marker size and colour encode FeeEuros
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x='Longitude',
    y='Latitude',
    hue='FeeEuros',
    size='FeeEuros',
    sizes=(40, 400),
    palette='viridis',
    alpha=0.7,
    edgecolor='black',
    ax=ax,
)
ax.set_title('Geographical Distribution of FeeEuros')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.grid(True)
fig.tight_layout()
fig.savefig("14_geo_feeeuros.png")
plt.close(fig)


In [None]:
import folium

# Interactive map opened at zoom level 2; each row of df becomes one circle
# marker whose popup shows the city and the fee.
m = folium.Map(location=[40, 0], zoom_start=2)

marker_fields = zip(df['Latitude'], df['Longitude'], df['LocationCity'], df['FeeEuros'])
for lat, lon, city, fee in marker_fields:
    marker = folium.CircleMarker(
        location=[lat, lon],
        radius=6,
        popup=f"{city}, {fee} EUR",
        color='blue',
        fill=True,
        fill_opacity=0.6,
    )
    marker.add_to(m)

m.save("15_geo_feeeuros_map.html")

In [None]:
# --- Word Report ---
# New empty document: a level-0 heading as the title, followed by a paragraph
# naming the target variable.
doc = Document()
doc.add_heading(text='Extended EDA Report: Legal Fee Dataset', level=0)
doc.add_paragraph(text="Target Variable: FeeEuros\n")

def add_section(title, desc, img):
    """Append one illustrated section to the module-level ``doc``.

    Args:
        title: Text of the level-1 heading that opens the section.
        desc: Paragraph text placed under the heading.
        img: Path of the image file inserted after the paragraph; it is
            rendered 5.5 inches wide.
    """
    picture_width = Inches(5.5)
    doc.add_heading(title, level=1)
    doc.add_paragraph(desc)
    doc.add_picture(img, width=picture_width)
# (heading, commentary, image file) for every figure, in the order they appear
# in the report.
report_sections = [
    ("1. Distribution of FeeEuros", "Right-skewed distribution with typical values 1400–1600 EUR.", "1_distribution_feeeuros.png"),
    ("2. Outlier Boxplot", "Shows extreme values possibly from long/complex trials.", "2_boxplot_feeeuros.png"),
    ("3. Correlation Heatmap", "FeeEuros strongly correlates with Trial Duration and Sentence Years.", "3_correlation_heatmap.png"),
    ("4. Pair Plot", "Clear trends across FeeEuros and legal metrics.", "4_pairplot.png"),
    ("5. Fee by LawFirm", "Firm C commands highest fees.", "5_fee_by_lawfirm.png"),
    ("6. Fee vs Trial Duration", "Longer trials often result in higher fees.", "6_fee_vs_trial_duration.png"),
    ("7. Fee by LawKind", "Fiscal and Arbitration cases yield higher returns.", "7_fee_by_lawkind.png"),
    ("8. Gender by LawKind", "Female lawyers are more prominent in Arbitration.", "8_gender_by_lawkind.png"),
    ("9. Z-Score Outliers", "Outliers detected using z-scores > 3.", "9_zscore_feeeuros.png"),
    ("10. Monthly Trend", "Slight trends visible in monthly averages.", "10_monthly_feeeuros.png"),
    ("11. Top Lawyers", "Top 10 lawyers by average case fee.", "11_top_lawyers.png"),
    ("12. Violin by Gender", "Fee distribution patterns vary slightly by gender.", "12_violin_gender_feeeuros.png"),
    ("13. Missing Value Heatmap", "Visual check for nulls.", "13_missing_values_heatmap.png"),
    (
        "14. Geo Plot of FeeEuros",
        "Fees are concentrated in major legal hubs. Law Firm C’s high-fee cases are often in Spain and Latin America.",
        "14_geo_feeeuros.png",
    ),
]
for section_title, section_desc, section_img in report_sections:
    add_section(section_title, section_desc, section_img)

# Closing section: the recommendations go into one paragraph, one bullet per line
summary_points = [
    "- Law Firm C excels in fee outcomes.",
    "- Fiscal and Arbitration law are high-fee domains.",
    "- Trial Duration and Sentence Years are key predictors.",
    "- Top-performing lawyers like Alejandra set benchmarks.",
    "- Monitor monthly fluctuations and address outliers.",
    "- Gender dynamics indicate strengths in Arbitration for female lawyers.",
]
doc.add_heading("Summary & Recommendations", level=1)
doc.add_paragraph("\n".join(summary_points))

doc.save("EDA_Report_Legal_Case_Extended.docx")
print("Extended EDA and report generated successfully.")


Extended EDA and report generated successfully.


In [None]:
# Setup for saving plots
# `os` comes from the notebook's top import cell; without that import this
# cell fails with NameError on a freshly started kernel.
plot_dir = "plots"
os.makedirs(plot_dir, exist_ok=True)

# Create Word document (rebinds `doc` to a new, empty report)
doc = Document()
doc.add_heading('Exploratory Data Analysis of Law Firms', 0)

# Add summary
doc.add_heading('Summary Statistics', level=1)
doc.add_paragraph(f"Total Records: {len(df)}")
doc.add_paragraph(f"Number of Unique Law Firms: {df['LawFirm'].nunique()}")
# Each label is cast to str before joining so that non-string firm identifiers
# (e.g. numeric codes) do not make str.join raise TypeError.
law_firm_labels = ', '.join(map(str, df['LawFirm'].unique()))
doc.add_paragraph(f"Law Firms: {law_firm_labels}")

# Helper that persists a figure and embeds it in the report
def plot_and_add(title, fig):
    """Save ``fig`` under ``plot_dir`` and append it to the module-level ``doc``.

    Args:
        title: Underscore-separated name. It is used verbatim as the PNG file
            stem, and with underscores replaced by spaces as the level-2
            heading and in the caption paragraph.
        fig: Matplotlib figure to write to disk; it is closed after saving.
    """
    readable_title = title.replace("_", " ")
    filename = os.path.join(plot_dir, f"{title}.png")

    fig.savefig(filename, bbox_inches='tight')
    plt.close(fig)

    doc.add_heading(readable_title, level=2)
    doc.add_picture(filename, width=Inches(5))
    doc.add_paragraph(f"Figure: {readable_title}.")

# 1. Mean YearsOfSentence per law firm
sentence_fig, sentence_ax = plt.subplots()
sns.barplot(x='LawFirm', y='YearsOfSentence', data=df, estimator=np.mean, ax=sentence_ax)
sentence_ax.set_title("Average Years of Sentence by Law Firm")
plot_and_add("Avg_Years_Of_Sentence_By_LawFirm", sentence_fig)

# 2. Row counts per law firm, split by Gender
gender_fig, gender_ax = plt.subplots()
sns.countplot(x='LawFirm', hue='Gender', data=df, ax=gender_ax)
gender_ax.set_title("Gender Distribution by Law Firm")
plot_and_add("Gender_Distribution_By_LawFirm", gender_fig)

# 3. Spread of TrialDurationInWeeks per law firm
duration_fig, duration_ax = plt.subplots()
sns.boxplot(x='LawFirm', y='TrialDurationInWeeks', data=df, ax=duration_ax)
duration_ax.set_title("Trial Duration in Weeks by Law Firm")
plot_and_add("Trial_Duration_By_LawFirm", duration_fig)

# 4. Spread of FeeEuros per law firm
fee_fig, fee_ax = plt.subplots()
sns.boxplot(x='LawFirm', y='FeeEuros', data=df, ax=fee_ax)
fee_ax.set_title("Fee Distribution by Law Firm")
plot_and_add("Fee_Distribution_By_LawFirm", fee_fig)

# 5. Row counts per law firm, split by LawKind (wider canvas for the extra bars)
lawkind_fig, lawkind_ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='LawFirm', hue='LawKind', data=df, ax=lawkind_ax)
lawkind_ax.set_title("LawKind Distribution by Law Firm")
plot_and_add("LawKind_Distribution_By_LawFirm", lawkind_fig)

# Optional: add a correlation heatmap
# Leave out 'z_fee' when it is present on df: it is only a standardised copy of
# FeeEuros, so it would add a redundant row/column that is perfectly correlated
# with FeeEuros. errors='ignore' keeps this cell working when the column is absent.
numeric_cols = df.select_dtypes(include=np.number).drop(columns=['z_fee'], errors='ignore')
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_cols.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plot_and_add("Correlation_Heatmap", fig)

# Save the Word document
doc.save("Law_Firm_EDA_Report.docx")
print("EDA Report generated and saved as 'Law_Firm_EDA_Report.docx'")

EDA Report generated and saved as 'Law_Firm_EDA_Report.docx'
