In [None]:
import pandas as pd

# Carregar o conjunto de dados
data_path = 'top_insta_influencers_data.csv'
data = pd.read_csv(data_path)

# Mostrar as primeiras linhas do dataset para entender sua estrutura
data.head()


Unnamed: 0,rank,channel_info,influence_score,posts,followers,avg_likes,60_day_eng_rate,new_post_avg_like,total_likes,country
0,1,cristiano,92,3.3k,475.8m,8.7m,1.39%,6.5m,29.0b,Spain
1,2,kyliejenner,91,6.9k,366.2m,8.3m,1.62%,5.9m,57.4b,United States
2,3,leomessi,90,0.89k,357.3m,6.8m,1.24%,4.4m,6.0b,
3,4,selenagomez,93,1.8k,342.7m,6.2m,0.97%,3.3m,11.5b,United States
4,5,therock,91,6.8k,334.1m,1.9m,0.20%,665.3k,12.5b,United States


In [10]:
def convert_to_numeric(value):
    if isinstance(value, str):
        if 'k' in value:
            return float(value.replace('k', '')) * 1e3
        elif 'm' in value:
            return float(value.replace('m', '')) * 1e6
        elif 'b' in value:
            return float(value.replace('b', '')) * 1e9
        elif '%' in value:
            return float(value.replace('%', '')) / 100
    return value

# Aplicar a função de conversão para as colunas necessárias
data['posts'] = data['posts'].apply(convert_to_numeric)
data['followers'] = data['followers'].apply(convert_to_numeric)
data['avg_likes'] = data['avg_likes'].apply(convert_to_numeric)
data['60_day_eng_rate'] = data['60_day_eng_rate'].apply(convert_to_numeric)
data['new_post_avg_like'] = data['new_post_avg_like'].apply(convert_to_numeric)
data['total_likes'] = data['total_likes'].apply(convert_to_numeric)

# Verificar os primeiros dados convertidos
data.head()

Unnamed: 0,rank,channel_info,influence_score,posts,followers,avg_likes,60_day_eng_rate,new_post_avg_like,total_likes,country
0,1,cristiano,92,3300.0,475800000.0,8700000.0,0.0139,6500000.0,29000000000.0,Spain
1,2,kyliejenner,91,6900.0,366200000.0,8300000.0,0.0162,5900000.0,57400000000.0,United States
2,3,leomessi,90,890.0,357300000.0,6800000.0,0.0124,4400000.0,6000000000.0,
3,4,selenagomez,93,1800.0,342700000.0,6200000.0,0.0097,3300000.0,11500000000.0,United States
4,5,therock,91,6800.0,334100000.0,1900000.0,0.002,665300.0,12500000000.0,United States


In [12]:
# Remover linhas com valores ausentes na variável dependente
data = data.dropna(subset=['60_day_eng_rate'])

# Agora podemos redefinir X e y sem valores ausentes
X = data[['followers', 'avg_likes', 'new_post_avg_like', 'total_likes']]
y = data['60_day_eng_rate']


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression


# Dividir o conjunto em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Padronizar os dados
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Seleção de variáveis (SelectKBest)
best_features = SelectKBest(score_func=f_regression, k='all')
X_train_best = best_features.fit_transform(X_train_scaled, y_train)
X_test_best = best_features.transform(X_test_scaled)

# Treinamento e avaliação do modelo para cada combinação de variáveis
results = []
for k in range(1, X_train_best.shape[1] + 1):
    # Seleciona as k melhores variáveis
    X_train_k = X_train_best[:, :k]
    X_test_k = X_test_best[:, :k]

    # Treinamento do modelo
    model = LinearRegression()
    model.fit(X_train_k, y_train)

    # Previsão e avaliação
    y_pred = model.predict(X_test_k)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    # Salva os resultados
    results.append((k, r2, mse, mae))

# Exibir os resultados para cada combinação de variáveis
for result in results:
    print(f'Número de variáveis: {result[0]}, R²: {result[1]:.4f}, MSE: {result[2]:.4f}, MAE: {result[3]:.4f}')


Número de variáveis: 1, R²: 0.0216, MSE: 0.0006, MAE: 0.0159
Número de variáveis: 2, R²: 0.5609, MSE: 0.0003, MAE: 0.0103
Número de variáveis: 3, R²: 0.9528, MSE: 0.0000, MAE: 0.0035
Número de variáveis: 4, R²: 0.9334, MSE: 0.0000, MAE: 0.0044
