# Terra Signal Hackathon
This notebook is provided as a starting point. Feel free to use it, discard it, modify it, or pretend it doesn't exist.

In [0]:
%pip install pandas

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the historical customer records from the CSV file next to the notebook.
file_path = "./history.csv"
df = pd.read_csv(file_path)

# Preview the first rows transposed, so each column of the wide frame is shown as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,4578-PHJYZ,6289-CPNLD,2682-KEVRP,5697-GOMBF,9717-QEBGU
gender,Male,Male,Female,Female,Male
SeniorCitizen,0,0,1,1,0
Partner,Yes,Yes,No,Yes,No
Dependents,Yes,Yes,No,Yes,No
tenure,52,33,22,28,2
PhoneService,Yes,Yes,Yes,No,Yes
MultipleLines,No,No,No,No phone service,No
InternetService,DSL,DSL,No,DSL,Fiber optic
OnlineSecurity,No,No,No internet service,No,No


In [9]:
# Summarise the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5634 entries, 0 to 5633
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        5634 non-null   object 
 1   gender            5634 non-null   object 
 2   SeniorCitizen     5634 non-null   int64  
 3   Partner           5634 non-null   object 
 4   Dependents        5634 non-null   object 
 5   tenure            5634 non-null   object 
 6   PhoneService      5634 non-null   object 
 7   MultipleLines     5634 non-null   object 
 8   InternetService   5634 non-null   object 
 9   OnlineSecurity    5634 non-null   object 
 10  OnlineBackup      5634 non-null   object 
 11  DeviceProtection  5634 non-null   object 
 12  TechSupport       5634 non-null   object 
 13  StreamingTV       5634 non-null   object 
 14  StreamingMovies   5634 non-null   object 
 15  Contract          5634 non-null   object 
 16  PaperlessBilling  5634 non-null   object 


In [17]:
# Print the frequency table of every column, each under its own header line.
for col, values in df.items():
    header = f"\n--- {col} ---"
    print(header)
    print(values.value_counts())



--- customerID ---
customerID
4578-PHJYZ    1
3806-YAZOV    1
1450-GALXR    1
3870-SPZSI    1
2296-DKZFP    1
             ..
2676-SSLTO    1
5577-OTWWW    1
2853-CWQFQ    1
3301-VKTGC    1
3915-ODIYG    1
Name: count, Length: 5634, dtype: int64

--- gender ---
gender
Male      2835
Female    2799
Name: count, dtype: int64

--- SeniorCitizen ---
SeniorCitizen
0    4706
1     928
Name: count, dtype: int64

--- Partner ---
Partner
No     2907
Yes    2727
Name: count, dtype: int64

--- Dependents ---
Dependents
No     3949
Yes    1685
Name: count, dtype: int64

--- tenure ---
tenure
1          475
72         266
2          188
3          161
4          144
          ... 
44          41
36          39
39          37
unknown     19
0           10
Name: count, Length: 74, dtype: int64

--- PhoneService ---
PhoneService
Yes    5041
No      523
no       40
yes      30
Name: count, dtype: int64

--- MultipleLines ---
MultipleLines
No                  2704
Yes                 2367
No phone serv

Limpeza dos dados

In [23]:
def _yes_no_to_int(values, extra_no=()):
    """Encode a Yes/No column as 1/0, ignoring case and surrounding whitespace.

    Labels listed in ``extra_no`` (for example "No phone service") are encoded
    as 0 as well. ``Series.map`` is used instead of ``Series.replace`` so no
    object-to-int downcast (and the pandas FutureWarning that comes with it)
    is involved. Any label that is not recognised becomes NaN, which makes the
    final ``astype(int)`` raise instead of letting bad data through silently.
    """
    mapping = {"yes": 1, "no": 0}
    mapping.update({label.strip().lower(): 0 for label in extra_no})
    return values.astype(str).str.strip().str.lower().map(mapping).astype(int)


# Work on a copy of the raw frame without the identifier column.
df_clean = df.drop(columns=["customerID"])

# --- tenure ---
# Non-numeric entries such as "unknown" are coerced to NaN, zero tenures are
# treated as missing too, and every gap is filled with the median before the
# column is cast to integer.
tenure = pd.to_numeric(df_clean["tenure"], errors="coerce")
tenure = tenure.mask(tenure == 0)
df_clean["tenure"] = tenure.fillna(tenure.median()).astype(int)

# --- phone service (raw values mix "Yes"/"No" with "yes"/"no") ---
df_clean["PhoneService"] = _yes_no_to_int(df_clean["PhoneService"])

# --- multiple lines ("No phone service" counts as "No") ---
df_clean["MultipleLines"] = _yes_no_to_int(
    df_clean["MultipleLines"], extra_no=("No phone service",)
)

# --- internet add-on columns ("No internet service" counts as "No") ---
internet_cols = [
    "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies"
]

for col in internet_cols:
    df_clean[col] = _yes_no_to_int(df_clean[col], extra_no=("No internet service",))

# --- plain Yes/No columns ---
for col in ["Partner", "Dependents", "PaperlessBilling"]:
    df_clean[col] = _yes_no_to_int(df_clean[col])

# --- total charges: coerce to numeric and fill missing values with the median ---
df_clean["TotalCharges"] = pd.to_numeric(df_clean["TotalCharges"], errors="coerce")
df_clean["TotalCharges"] = df_clean["TotalCharges"].fillna(df_clean["TotalCharges"].median())

# --- customer feedback: missing text becomes "", plus a lower-cased copy
# stripped of everything except letters, digits and spaces ---
df_clean["CustomerFeedback"] = df_clean["CustomerFeedback"].fillna("").astype(str)
df_clean["CustomerFeedback_clean"] = (
    df_clean["CustomerFeedback"]
    .str.lower()
    .str.replace("[^a-zA-Z0-9 ]", "", regex=True)
)

# --- remaining categoricals: one-hot encode, dropping the first level of each ---
cat_cols = [
    "gender", "InternetService", "Contract", "PaymentMethod"
]

df_clean = pd.get_dummies(df_clean, columns=cat_cols, drop_first=True)

# --- target: Yes/No -> 1/0 ---
df_clean["Churn"] = _yes_no_to_int(df_clean["Churn"])


  .replace({"yes": 1, "no": 0})


In [24]:
# Check dtypes and non-null counts of the cleaned frame, then preview its first rows.
df_clean.info()
df_clean.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5634 entries, 0 to 5633
Data columns (total 27 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          5634 non-null   int64  
 1   Partner                                5634 non-null   int32  
 2   Dependents                             5634 non-null   int32  
 3   tenure                                 5634 non-null   int32  
 4   PhoneService                           5634 non-null   int32  
 5   MultipleLines                          5634 non-null   int32  
 6   OnlineSecurity                         5634 non-null   int32  
 7   OnlineBackup                           5634 non-null   int32  
 8   DeviceProtection                       5634 non-null   int32  
 9   TechSupport                            5634 non-null   int32  
 10  StreamingTV                            5634 non-null   int32  
 11  Stre

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,...,MonthlyIncome,CustomerFeedback_clean,gender_Male,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,1,52,1,0,0,1,1,1,...,6532,i have been a customer with this internet prov...,True,False,False,True,False,False,True,False
1,0,1,1,33,1,0,0,0,1,1,...,7634,ive been a customer with this company for over...,True,False,False,True,False,False,False,True
2,1,0,0,22,1,0,0,0,0,0,...,3628,i have been a customer with this company for 2...,False,False,True,True,False,False,False,True
3,1,1,1,28,0,0,0,0,0,0,...,7851,i have been a customer with this internet prov...,False,False,False,False,False,False,True,False
4,0,0,0,2,1,0,0,0,0,0,...,1691,i have been using the fiber optic internet ser...,True,True,False,False,False,False,True,False


In [0]:
import datetime

def prediction_function(input_df, seed=None):
    '''
    Baseline model: predict at random whether each customer will churn.
    TODO: Make a better model.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain a 'customerID' column; any other columns are ignored.
    seed : int, optional
        Seed for a dedicated random generator, so the same seed always gives
        the same predictions. When omitted, numbers are drawn from NumPy's
        global random state, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        A new frame with the 'customerID' column and a 'prediction' column
        holding 'Yes' or 'No'. The input frame is not modified.

    Raises
    ------
    KeyError
        If input_df has no 'customerID' column.
    '''
    # Only build a private generator when a seed is given; the module-level
    # np.random keeps honouring any np.random.seed(...) set by the caller.
    rng = np.random if seed is None else np.random.default_rng(seed)
    X = input_df[['customerID']].copy()
    X['prediction'] = rng.uniform(size=len(X)) >= 0.5
    X['prediction'] = X['prediction'].map({True: 'Yes', False: 'No'})
    return X

# Score the inference set with the current model and preview the first predictions.
test_df = pd.read_csv('inference.csv')
prediction = prediction_function(test_df)
print(prediction.head().T)

# Save the predictions to a timestamped CSV file for submission.
timestamp = f"{datetime.datetime.now():%Y%m%d_%H%M%S}"
prediction.to_csv(f'prediction_<MY_GROUP_NAME>_{timestamp}.csv')