In [2]:
import pandas as pd
import numpy as np
from scipy import stats

# Load the raw dataset; the path is resolved relative to the working directory
df = pd.read_csv('Football Teams Price.csv')

# Convert Price to numeric. Dollar signs, commas and double quotes are stripped
# first; any value that still cannot be parsed becomes NaN (errors='coerce')
# instead of raising. The pattern is a raw string so that "\$" reaches the
# regex engine unchanged rather than being an invalid Python string escape.
df['Price'] = pd.to_numeric(
    df['Price'].replace(r'[\$,"]', '', regex=True),
    errors='coerce',
)

# Function to calculate correlation with target
def correlation_with_target(df, target='Price'):
    """Return the correlation of each numeric column with ``target``.

    Only numeric (and boolean) columns take part in the computation, so a
    frame that also carries text columns can be passed in directly.

    Args:
        df: DataFrame containing the target column and candidate features.
        target: Name of the column every other column is correlated against.

    Returns:
        A Series indexed by column name holding the pairwise correlation
        computed by ``DataFrame.corr`` (its default method), sorted from
        highest to lowest. The target itself is included in the result.

    Raises:
        KeyError: If ``target`` is not one of the numeric columns of ``df``.
    """
    # Restrict to numeric data up front: DataFrame.corr() on a frame holding
    # string columns raises on recent pandas versions.
    numeric = df.select_dtypes(include=['number', 'bool'])
    if target not in numeric.columns:
        raise KeyError(f"{target!r} is not a numeric column of the DataFrame")
    return numeric.corr()[target].sort_values(ascending=False)

# Function to perform chi-square test for categorical variables
def chi_square_test(df, categorical_col, target='Price', bins=5):
    """Return the chi-square independence p-value of a column vs. the target.

    The numeric target is discretised with ``pd.cut`` and cross-tabulated
    against ``categorical_col``; the resulting contingency table is passed to
    ``scipy.stats.chi2_contingency``.

    Args:
        df: DataFrame containing both columns.
        categorical_col: Name of the categorical column to test.
        target: Name of the numeric column to discretise.
        bins: Passed straight to ``pd.cut`` (bin count or explicit edges).

    Returns:
        The p-value of the test, or NaN when the contingency table is empty
        (no row has both a category and a binned target value).
    """
    binned_target = pd.cut(df[target], bins=bins)
    contingency_table = pd.crosstab(df[categorical_col], binned_target)

    # chi2_contingency raises on an empty table; report "no result" instead
    # so that one unusable column does not abort a whole analysis loop.
    if contingency_table.size == 0:
        return float('nan')

    # Only the p-value is of interest; statistic, dof and expected are dropped.
    _, p_value, _, _ = stats.chi2_contingency(contingency_table)
    return p_value

# Split the columns by dtype: numeric ones versus everything else
numeric_dtypes = [np.number]
numerical_cols = df.select_dtypes(include=numeric_dtypes).columns.tolist()
categorical_cols = df.select_dtypes(exclude=numeric_dtypes).columns.tolist()

# The Price target always belongs to the numerical list, never the categorical one
if 'Price' not in numerical_cols:
    numerical_cols.append('Price')
if 'Price' in categorical_cols:
    categorical_cols.remove('Price')

# Summary statistics of the numerical columns
print("Basic Statistics for Numerical Variables:")
numeric_summary = df[numerical_cols].describe()
print(numeric_summary.to_string())
print("\n")

# How strongly each numerical column correlates with the Price target
print("Correlation with Price for Numerical Variables:")
numeric_frame = df[numerical_cols]
correlations = correlation_with_target(numeric_frame)
print(correlations.to_string())
print("\n")

# Chi-square independence p-value of each categorical column against Price
print("Chi-square Test P-values for Categorical Variables:")
for column in categorical_cols:
    print(f"{column}: {chi_square_test(df, column)}")
print("\n")

# Number of missing entries per column
print("Missing Values:")
missing_per_column = df.isnull().sum()
print(missing_per_column.to_string())
print("\n")

# Count IQR-rule outliers per numerical feature (the Price target is skipped)
print("Outliers in Numerical Columns (count of values beyond 1.5 IQR):")
for feature in numerical_cols:
    if feature == 'Price':
        continue
    values = df[feature]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    # A value is an outlier when it lies beyond either 1.5 * IQR fence
    too_low = values < (lower_quartile - 1.5 * spread)
    too_high = values > (upper_quartile + 1.5 * spread)
    print(f"{feature}: {(too_low | too_high).sum()}")
print("\n")

# Relative frequency of every level of each categorical column
print("Distribution of Categorical Variables:")
for feature in categorical_cols:
    print(f"\n{feature}:")
    level_shares = df[feature].value_counts(normalize=True)
    print(level_shares.to_string())
print("\n")

# Potential feature interactions (example with top correlated numerical features)
# `correlations` is sorted in descending order, so once the target itself is
# dropped the first five entries are the five features with the highest
# correlation with Price.
top_features = correlations.drop('Price', errors='ignore').head(5).index.tolist()
print("Potential Feature Interactions (correlation with Price):")
for i, first in enumerate(top_features):
    # Pair each feature only with the ones after it so every pair appears once
    for second in top_features[i + 1:]:
        interaction = df[first] * df[second]
        # Series.corr ignores rows where either side is NaN, whereas
        # np.corrcoef would yield NaN for the whole pair; Price can hold NaN
        # because unparseable values are coerced during loading.
        correlation = interaction.corr(df['Price'])
        print(f"{first} * {second}: {correlation}")

FileNotFoundError: [Errno 2] No such file or directory: 'Football Teams Price.csv'