In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset, trying the most likely text encodings in order.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists. Consider a path relative to the notebook.
file_path = r"C:\Users\Anamika Pandey\Desktop\SHRUTI1\Summer-Olympic-medals-1976-to-2008.csv"

# latin1 maps every possible byte to a character, so decoding with it can never
# raise UnicodeDecodeError. It therefore has to be the LAST candidate: tried
# first, it would make every fallback unreachable and could silently mis-decode
# the text. Because it always succeeds, `df` is guaranteed to be bound after the
# loop (other errors, e.g. a missing file, still propagate unchanged).
for encoding in ("utf-8", "cp1252", "latin1"):
    try:
        df = pd.read_csv(file_path, encoding=encoding)
    except UnicodeDecodeError:
        print(f"{encoding} encoding failed, trying the next candidate...")
    else:
        print(f"Loaded dataset using {encoding} encoding")
        break

# Check the first few rows of the dataset
print("\n=== HEAD ===")
print(df.head())

# Summary of the dataset.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
print("\n=== INFO ===")
df.info()

# Descriptive statistics for the numeric columns
print("\n=== DESCRIBE ===")
print(df.describe())


: 

In [None]:
 #data clean
# Count the missing values in each column before cleaning
print(df.isnull().sum())

# Drop every row that has at least one missing value.
# The result goes into a new name, so the raw frame `df` stays untouched and
# this cell can be re-run safely.
df_cleaned = df.dropna()

# Re-inspect the cleaned frame.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df_cleaned.info()


In [None]:
# -------------------------------------------------
# Medal totals per country, plus a top-10 bar chart
# -------------------------------------------------

# Count the non-null Medal entries for each country, largest total first
country_groups = df_cleaned.groupby('Country')
medals_by_country = country_groups['Medal'].count()
medals_by_country = medals_by_country.sort_values(ascending=False)

# Print the full ranking
print("\n=== Total medals by country (descending order) ===")
print(medals_by_country)

# Bar chart restricted to the ten highest totals
top_ten_countries = medals_by_country.head(10)

fig, ax = plt.subplots(figsize=(10, 6))
top_ten_countries.plot(kind='bar', color='gold', edgecolor='black', ax=ax)

ax.set_title("Top 10 Countries by Medal Count")
ax.set_xlabel("Country")
ax.set_ylabel("Total Medals")
ax.tick_params(axis='x', labelrotation=45)  # slant the country names
fig.tight_layout()
plt.show()


In [None]:
# -------------------------------------------------
# Medal totals per year, drawn as a trend line
# -------------------------------------------------

# Count the non-null Medal entries recorded for each year
medals_over_years = df_cleaned.groupby('Year')['Medal'].count()

# Print the yearly totals
print("\n=== Total medals won each year ===")
print(medals_over_years)

# Appearance of the trend line, gathered in one place
line_style = dict(
    marker='o',
    linestyle='--',
    color='darkorange',
    linewidth=2,
    markersize=8,
    label='Medals Count',
)
bold = {'fontweight': 'bold'}

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(medals_over_years.index, medals_over_years.values, **line_style)

ax.set_title("Total Medals Won Over the Years", fontsize=16, **bold)
ax.set_xlabel("Year", fontsize=14, **bold)
ax.set_ylabel("Total Medals", fontsize=14, **bold)
ax.legend()
ax.grid(which='both', linestyle=':', linewidth=0.7)
fig.tight_layout()
plt.show()

In [None]:
# ============================
# 🎯 Distribution of Participants by Gender
# ============================

# Count the rows of df_cleaned per Gender value, ordered by label so the
# wedge order is stable between runs.
gender_counts = df_cleaned['Gender'].value_counts().sort_index()

# Display counts in console
print("\n=== Gender Counts ===")
for gender, count in gender_counts.items():
    print(f"{gender}: {count}")

# ax.pie requires exactly one explode offset per wedge, so the list is built
# from the data instead of being hard-coded for two categories: the first
# wedge is pulled out slightly, all others stay in place.
explode = [0.07 if position == 0 else 0 for position in range(len(gender_counts))]

# Create pie chart with customizations
fig, ax = plt.subplots(figsize=(7, 5))
wedges, texts, autotexts = ax.pie(
    gender_counts,
    labels=gender_counts.index,
    autopct='%1.0f%%',       # whole-number percentage printed on each wedge
    startangle=90,
    counterclock=False,      # lay the wedges out clockwise from the top
    shadow=True,
    colors=['#f7b7a3', '#8fc7e7'],  # matplotlib cycles these if there are more wedges
    explode=explode,
)

# Style adjustments: give the figure a title and render it the same way the
# other plotting cells in this notebook do.
ax.set_title("Distribution of Participants by Gender", fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

In [None]:
# ================================================
# 🏅 Top 10 Athletes with the Most Medals
# ================================================

# Group by Athlete and count the non-null Medal entries, largest total first.
# NOTE(review): grouping is by name only, so two different athletes who share
# a name would be merged into one total — confirm whether that matters here.
athlete_medal_counts = (
    df_cleaned
    .groupby('Athlete')['Medal']
    .count()
    .sort_values(ascending=False)
)
top_athletes = athlete_medal_counts.head(10)

# Display top 10 in console
print("\n=== Top 10 Athletes by Medal Count ===")
print(top_athletes)

# Plotting the top 10 athletes with most medals
fig, ax = plt.subplots(figsize=(12, 7))
# pandas returns the Axes it drew on (the same object as `ax`), not the bar
# artists; the `bars` name is kept only so any later cell using it still works.
bars = top_athletes.plot(
    kind='bar',
    color='#c0c0c0',
    edgecolor='black',
    ax=ax,
)

# The title is plain text: an emoji here is typically not available in
# matplotlib's default font and would render as an empty box with a
# "Glyph missing from font" warning.
ax.set_title("Top 10 Athletes by Total Medal Count", fontsize=16, fontweight='bold')
ax.set_xlabel("Athlete", fontsize=14, fontweight='bold')
ax.set_ylabel("Total Medals", fontsize=14, fontweight='bold')
plt.xticks(rotation=45, ha='right', fontsize=12)  # right-align so slanted names sit under their bars
plt.yticks(fontsize=12)
ax.grid(axis='y', linestyle='--', linewidth=0.5, alpha=0.7)
fig.tight_layout()
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np
import pandas as pd

# Define features and target
features = ['Country_Code', 'Sport_Code', 'Gender_Code', 'Event_Gender_Code']
target = 'Medal_Binary'

# Fail fast with an actionable message if the engineered columns are absent.
# None of the cells visible above create them, so on a fresh kernel this cell
# would otherwise stop with a bare KeyError that does not say what to do.
missing_columns = [col for col in features + [target] if col not in df_cleaned.columns]
if missing_columns:
    raise KeyError(
        f"df_cleaned is missing required column(s): {missing_columns}. "
        "Run the feature-encoding step that creates them before this cell."
    )

# Ensure target column has both classes
print("Target distribution:\n", df_cleaned[target].value_counts())

# NOTE(review): the *_Code names suggest integer label codes. If so, logistic
# regression will treat them as ordered magnitudes; consider one-hot encoding
# them — confirm against the cell that builds these columns.
X = df_cleaned[features]
y = df_cleaned[target]

# Stratified split to preserve class ratio; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Check class presence: a classifier cannot be fitted on a single class
if len(np.unique(y_train)) < 2:
    print("Training data contains only one class. Please check the data or class balance.")
else:
    # Train on the 70% split and evaluate on the held-out 30%
    model = LogisticRegression(max_iter=500)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("\n Accuracy Score:", accuracy_score(y_test, y_pred))
    print("\n Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\n Classification Report:\n", classification_report(y_test, y_pred))
