# Terra Signal Hackathon
This notebook is provided as a starting point. Feel free to use it, discard it, modify it, or pretend it doesn't exist.

In [0]:
%pip install pandas

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the historical customer records; the last expression previews the
# first five rows with the columns laid out as rows.
file_path = "./history.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5).T

In [0]:
# Print a concise summary of the raw frame (columns, non-null counts, dtypes).
df.info()

In [0]:
# Print the frequency of every distinct value, one column at a time.
for col, column_values in df.items():
    print(f"\n--- {col} ---")
    print(column_values.value_counts())


## Limpeza dos dados

In [0]:
# Work on an independent copy of the raw frame, without the customerID column.
df_clean = df.copy().drop(columns="customerID")

# Clean 'tenure': coerce to numbers ("unknown" and any other non-numeric
# entry become NaN), treat 0 as missing, impute missing entries with the
# median, then store the column as integers.
tenure = pd.to_numeric(
    df_clean["tenure"].replace("unknown", pd.NA),
    errors="coerce",
)
tenure = tenure.mask(tenure == 0)  # zeros become NaN so they are imputed too
df_clean["tenure"] = tenure.fillna(tenure.median()).astype(int)

# Encode PhoneService as 0/1 after trimming whitespace and lower-casing.
# "yes"/"no" are swapped for the digit strings "1"/"0" (str -> str), so the
# only dtype change is the explicit astype(int) at the end. Replacing strings
# with ints directly makes pandas 2.2 emit a FutureWarning about implicit
# downcasting in `replace`; a trailing astype(int) does not prevent that.
df_clean["PhoneService"] = (
    df_clean["PhoneService"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({"yes": "1", "no": "0"})
    .astype(int)  # raises ValueError for anything other than yes/no or an integer string
)

# Encode MultipleLines as 0/1; "No phone service" is treated the same as "No".
multiple_lines_codes = {"Yes": 1, "No": 0, "No phone service": 0}
df_clean["MultipleLines"] = (
    df_clean["MultipleLines"].map(multiple_lines_codes).astype(int)
)

# Encode the internet add-on columns as 0/1; "No internet service" is treated
# the same as "No".
internet_cols = [
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]
internet_codes = {"Yes": 1, "No": 0, "No internet service": 0}

for col in internet_cols:
    df_clean[col] = df_clean[col].map(internet_codes).astype(int)

# Encode the remaining plain Yes/No columns as 1/0.
yes_no_codes = {"Yes": 1, "No": 0}
for col in ("Partner", "Dependents", "PaperlessBilling"):
    encoded = df_clean[col].map(yes_no_codes)
    df_clean[col] = encoded.astype(int)

# Parse TotalCharges as numbers (unparseable entries become NaN) and impute
# the missing values with the column median.
total_charges = pd.to_numeric(df_clean["TotalCharges"], errors="coerce")
df_clean["TotalCharges"] = total_charges.fillna(total_charges.median())

# Normalise the free-text feedback: missing entries become empty strings, and
# a lower-cased copy keeps only ASCII letters, digits and spaces.
feedback_text = df_clean["CustomerFeedback"].fillna("").astype(str)
df_clean["CustomerFeedback"] = feedback_text
df_clean["CustomerFeedback_clean"] = feedback_text.str.lower().str.replace(
    "[^a-zA-Z0-9 ]", "", regex=True
)

# One-hot encode the multi-valued categorical columns; drop_first=True omits
# the first level of each column from the generated dummies.
cat_cols = ["gender", "InternetService", "Contract", "PaymentMethod"]
df_clean = pd.get_dummies(data=df_clean, columns=cat_cols, drop_first=True)

# Encode the Churn target as 1 ("Yes") / 0 ("No").
churn_codes = {"Yes": 1, "No": 0}
df_clean["Churn"] = df_clean["Churn"].map(churn_codes).astype(int)


In [0]:
# Print the schema of the cleaned frame, then write it to history_clean.csv
# without the index. (A bare `df_clean.head()` in the middle of a cell is
# neither displayed nor stored, so it was dropped.)
df_clean.info()
df_clean.to_csv("history_clean.csv", index=False)


In [0]:
# Preview the first five rows of the cleaned frame.
df_clean.head()

In [0]:
import datetime

def prediction_function(input_df, seed=None):
    '''
    An example model function, that just predicts randomly whether a customer will churn.

    Each customer is labelled 'Yes' when a uniform draw in [0, 1) is >= 0.5 and
    'No' otherwise.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain a 'customerID' column; all other columns are ignored.
    seed : int, optional
        When given, the draws come from a dedicated generator seeded with this
        value, so repeated calls return the same predictions. When None (the
        default), the draws come from NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'customerID' and 'prediction'
        ('Yes'/'No'). `input_df` itself is not modified.

    Raises
    ------
    KeyError
        If `input_df` has no 'customerID' column.

    TODO: Make a better model.
    '''
    X = input_df[['customerID']].copy()
    if seed is None:
        draws = np.random.uniform(size=len(X))
    else:
        # A local generator keeps seeded runs independent of the global state.
        draws = np.random.default_rng(seed).uniform(size=len(X))
    X['prediction'] = np.where(draws >= 0.5, 'Yes', 'No')
    return X

# Score the inference set with the current model and preview the result.
test_df = pd.read_csv('inference.csv')
prediction = prediction_function(test_df)
print(prediction.head().T)
# Save the prediction to a timestamped csv file for submission.
# NOTE(review): <MY_GROUP_NAME> looks like a placeholder to replace with the
# team name before submitting — confirm.
timestamp = f"{datetime.datetime.now():%Y%m%d_%H%M%S}"
prediction.to_csv(f'prediction_<MY_GROUP_NAME>_{timestamp}.csv')

## Análise Exploratória