# Análise de Regressão Linear - Communities and Crime Dataset

Este notebook realiza análise de regressão linear para predizer a variável ViolentCrimesPerPop.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Carregar dados
def load_data(path='communities.data'):
    """Load the Communities and Crime data file into a DataFrame.

    The file is read as header-less comma-separated text; column names are
    supplied from the list below, which follows the dataset's companion
    ``.names`` file. Fields equal to ``'?'`` are parsed as missing (NaN).

    Args:
        path: Location of the data file. Defaults to ``'communities.data'``
            (relative to the current working directory), so existing
            zero-argument calls keep working.

    Returns:
        pandas.DataFrame whose columns are the names listed below, ending
        with the target column ``ViolentCrimesPerPop``.
    """
    # Column names, in file order, taken from the .names file
    columns = [
        'state', 'county', 'community', 'communityname', 'fold', 'population', 'householdsize',
        'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp', 'agePct12t21', 'agePct12t29',
        'agePct16t24', 'agePct65up', 'numbUrban', 'pctUrban', 'medIncome', 'pctWWage', 'pctWFarmSelf',
        'pctWInvInc', 'pctWSocSec', 'pctWPubAsst', 'pctWRetire', 'medFamInc', 'perCapInc',
        'whitePerCap', 'blackPerCap', 'indianPerCap', 'AsianPerCap', 'OtherPerCap', 'HispPerCap',
        'NumUnderPov', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
        'PctUnemployed', 'PctEmploy', 'PctEmplManu', 'PctEmplProfServ', 'PctOccupManu',
        'PctOccupMgmtProf', 'MalePctDivorce', 'MalePctNevMarr', 'FemalePctDiv', 'TotalPctDiv',
        'PersPerFam', 'PctFam2Par', 'PctKids2Par', 'PctYoungKids2Par', 'PctTeen2Par',
        'PctWorkMomYoungKids', 'PctWorkMom', 'NumIlleg', 'PctIlleg', 'NumImmig', 'PctImmigRecent',
        'PctImmigRec5', 'PctImmigRec8', 'PctImmigRec10', 'PctRecentImmig', 'PctRecImmig5',
        'PctRecImmig8', 'PctRecImmig10', 'PctSpeakEnglOnly', 'PctNotSpeakEnglWell',
        'PctLargHouseFam', 'PctLargHouseOccup', 'PersPerOccupHous', 'PersPerOwnOccHous',
        'PersPerRentOccHous', 'PctPersOwnOccup', 'PctPersDenseHous', 'PctHousLess3BR',
        'MedNumBR', 'HousVacant', 'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded',
        'PctVacMore6Mos', 'MedYrHousBuilt', 'PctHousNoPhone', 'PctWOFullPlumb', 'OwnOccLowQuart',
        'OwnOccMedVal', 'OwnOccHiQuart', 'RentLowQ', 'RentMedian', 'RentHighQ', 'MedRent',
        'MedRentPctHousInc', 'MedOwnCostPctInc', 'MedOwnCostPctIncNoMtg', 'NumInShelters',
        'NumStreet', 'PctForeignBorn', 'PctBornSameState', 'PctSameHouse85', 'PctSameCity85',
        'PctSameState85', 'LemasSwornFT', 'LemasSwFTPerPop', 'LemasSwFTFieldOps',
        'LemasSwFTFieldPerPop', 'LemasTotalReq', 'LemasTotReqPerPop', 'PolicReqPerOffic',
        'PolicPerPop', 'RacialMatchCommPol', 'PctPolicWhite', 'PctPolicBlack', 'PctPolicHisp',
        'PctPolicAsian', 'PctPolicMinor', 'OfficAssgnDrugUnits', 'NumKindsDrugsSeiz',
        'PolicAveOTWorked', 'LandArea', 'PopDens', 'PctUsePubTrans', 'PolicCars', 'PolicOperBudg',
        'LemasPctPolicOnPatr', 'LemasGangUnitDeploy', 'LemasPctOfficDrugUn', 'PolicBudgPerPop',
        'ViolentCrimesPerPop',
    ]

    # header=None is what pandas infers when ``names`` is given; spelling it
    # out makes it obvious that the first line of the file is data.
    return pd.read_csv(path, names=columns, header=None, na_values='?')

In [None]:
def analyze_data(df):
    """Print an exploratory summary of ``df`` and tabulate its missing values.

    Reports the frame's shape, the number of distinct and missing values in
    the ``ViolentCrimesPerPop`` column, and the ten columns with the highest
    share of missing entries.

    Args:
        df: DataFrame that contains a ``ViolentCrimesPerPop`` column.

    Returns:
        DataFrame with columns ``Coluna``, ``Valores_Faltantes`` and
        ``Percentual``: one row per column of ``df`` that has at least one
        missing value, ordered by ``Percentual`` descending.
    """
    print("=== ANÁLISE EXPLORATÓRIA DOS DADOS ===")
    print(f"Dimensões do dataset: {df.shape}")
    print("\nVariável alvo: ViolentCrimesPerPop")
    target = df['ViolentCrimesPerPop']
    print(f"Valores únicos na variável alvo: {target.nunique()}")
    print(f"Valores faltantes na variável alvo: {target.isnull().sum()}")

    # Per-column missing counts and the matching percentage of all rows
    null_counts = df.isnull().sum()
    null_share = null_counts / len(df) * 100
    summary = pd.DataFrame({
        'Coluna': null_counts.index,
        'Valores_Faltantes': null_counts.values,
        'Percentual': null_share.values,
    })

    # Keep only columns that actually have gaps, worst first
    has_gaps = summary['Valores_Faltantes'] > 0
    missing_df = summary[has_gaps].sort_values('Percentual', ascending=False)

    print("\n=== VALORES FALTANTES ===")
    print(f"Total de colunas com valores faltantes: {len(missing_df)}")
    print("\nTop 10 colunas com mais valores faltantes:")
    print(missing_df.head(10))

    return missing_df

In [None]:
def preprocess_data(df, missing_threshold=0.5):
    """Build the feature matrix and target vector used for modelling.

    Steps, in order:
      1. Drop the identifier columns ``state``, ``county``, ``community``,
         ``communityname`` and ``fold``.
      2. Split off ``ViolentCrimesPerPop`` as the target and discard rows
         where it is missing.
      3. Drop every feature whose fraction of missing values is strictly
         greater than ``missing_threshold``.
      4. Fill the remaining gaps with each column's median.

    Args:
        df: Raw DataFrame containing the identifier columns above and the
            ``ViolentCrimesPerPop`` column.
        missing_threshold: Fraction in ``[0, 1]``; features with a larger
            share of missing values are removed. Defaults to ``0.5``, the
            value that was previously hard-coded.

    Returns:
        Tuple ``(X_processed, y, missing_cols, imputer)``: the imputed
        feature DataFrame (same index as the retained rows), the target
        Series, the list of removed column names, and the fitted
        ``SimpleImputer``.

    Raises:
        ValueError: If ``missing_threshold`` is outside ``[0, 1]``.
    """
    if not 0 <= missing_threshold <= 1:
        raise ValueError(
            f"missing_threshold must be between 0 and 1, got {missing_threshold!r}"
        )

    print("\n=== PRÉ-PROCESSAMENTO DOS DADOS ===")

    # Remove identifier / bookkeeping columns that carry no predictive signal
    non_predictive = ['state', 'county', 'community', 'communityname', 'fold']
    df_clean = df.drop(columns=non_predictive)

    # Separate features from the target
    X = df_clean.drop('ViolentCrimesPerPop', axis=1)
    y = df_clean['ViolentCrimesPerPop']

    # Rows without a target value cannot be used for supervised training
    mask = y.notnull()
    X = X[mask]
    y = y[mask]

    print(f"Dados após remoção de valores nulos na variável alvo: {X.shape}")

    # Compute each column's missing fraction once and reuse it for both the
    # selection and the report below
    missing_fraction = X.isnull().mean()
    missing_cols = missing_fraction.index[missing_fraction > missing_threshold].tolist()

    print(f"\nColunas com mais de {missing_threshold*100}% de valores faltantes:")
    for col in missing_cols:
        print(f"  {col}: {missing_fraction[col] * 100:.1f}%")

    # Describe the missing-value strategy; the percentage follows the
    # threshold actually in use rather than a fixed "50%"
    print("\n=== ESTRATÉGIAS DE TRATAMENTO ===")
    print(f"1. Colunas com >{missing_threshold:.0%} de valores faltantes: Removidas")
    print("2. Demais colunas: Imputação pela mediana")

    # Drop the sparsely populated columns
    X_processed = X.drop(columns=missing_cols)

    # Median-impute whatever is still missing.
    # NOTE(review): the medians are learned from every row passed in. If the
    # caller splits into train/test afterwards (as the driver cell in this
    # file does), test rows influence the imputation values.
    imputer = SimpleImputer(strategy='median')
    X_processed = pd.DataFrame(
        imputer.fit_transform(X_processed),
        columns=X_processed.columns,
        index=X_processed.index
    )

    print(f"Dimensões finais dos dados: {X_processed.shape}")
    print(f"Valores faltantes restantes: {X_processed.isnull().sum().sum()}")

    return X_processed, y, missing_cols, imputer

In [None]:
def train_model(X, y):
    """Fit a linear regression on a 70/30 split and print its error metrics.

    The split uses ``test_size=0.3`` with ``random_state=42``. RMSE, MAE and
    R² are printed for both the training and the test partition.

    Args:
        X: Feature matrix accepted by ``train_test_split`` and
            ``LinearRegression.fit``.
        y: Target values aligned with ``X``.

    Returns:
        Tuple ``(model, X_train, X_test, y_train, y_test, y_train_pred,
        y_test_pred)``.
    """
    def _rmse(actual, predicted):
        # Root of the mean squared error
        return np.sqrt(mean_squared_error(actual, predicted))

    print("\n=== DIVISÃO DOS DADOS E TREINAMENTO ===")

    # Hold out 30% of the rows for evaluation; fixed seed for repeatability
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    n_total = len(X)
    n_train = X_train.shape[0]
    n_test = X_test.shape[0]
    print(f"Tamanho do conjunto de treino: {n_train} ({n_train/n_total*100:.1f}%)")
    print(f"Tamanho do conjunto de teste: {n_test} ({n_test/n_total*100:.1f}%)")

    # Fit the linear regression on the training partition only
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict on both partitions so train and test error can be compared
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_rmse = _rmse(y_train, y_train_pred)
    test_rmse = _rmse(y_test, y_test_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    print("\n=== RESULTADOS DO MODELO ===")
    print(f"TREINO - RMSE: {train_rmse:.4f}, MAE: {train_mae:.4f}")
    print(f"TESTE  - RMSE: {test_rmse:.4f}, MAE: {test_mae:.4f}")
    print(f"R² Score (treino): {model.score(X_train, y_train):.4f}")
    print(f"R² Score (teste): {model.score(X_test, y_test):.4f}")

    return model, X_train, X_test, y_train, y_test, y_train_pred, y_test_pred

In [None]:
def feature_importance_analysis(model, X):
    """Rank features by the absolute value of their fitted coefficient.

    Prints the ten largest-magnitude coefficients and returns the full
    ranking.

    Args:
        model: Fitted estimator exposing a ``coef_`` array with one entry
            per column of ``X``.
        X: DataFrame whose ``columns`` name the features, in the order the
            model was trained on.

    Returns:
        DataFrame with columns ``Feature``, ``Coefficient`` and
        ``Abs_Coefficient``, sorted by ``Abs_Coefficient`` descending.
    """
    print("\n=== ANÁLISE DE IMPORTÂNCIA DAS FEATURES ===")

    coefficients = model.coef_
    table = pd.DataFrame({
        'Feature': X.columns,
        'Coefficient': coefficients,
        'Abs_Coefficient': np.abs(coefficients),
    })
    # Largest magnitude first, regardless of sign
    ranked = table.sort_values('Abs_Coefficient', ascending=False)

    print("Top 10 features mais importantes (por magnitude do coeficiente):")
    top_ten = ranked.head(10)
    print(top_ten[['Feature', 'Coefficient']])

    return ranked

In [None]:
# Run the full pipeline: load, explore, preprocess, fit, rank coefficients
df = load_data()
missing_analysis = analyze_data(df)
X, y, removed_cols, imputer = preprocess_data(df)
(
    model,
    X_train, X_test,
    y_train, y_test,
    y_train_pred, y_test_pred,
) = train_model(X, y)
importance = feature_importance_analysis(model, X)

# Closing summary of what the preprocessing did
print("\n=== CONSIDERAÇÕES FINAIS ===")
print(f"1. Foram removidas {len(removed_cols)} colunas com >50% de valores faltantes")
print("2. Valores faltantes restantes foram imputados pela mediana")
print(f"3. O modelo final usa {X.shape[1]} features para predizer ViolentCrimesPerPop")
# List at most the first five removed columns to keep the line short
if len(removed_cols) > 5:
    print(f"4. Colunas removidas: {removed_cols[:5]}...")
else:
    print(f"4. Colunas removidas: {removed_cols}")