In [107]:
import os
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv
import statsmodels.api as sm
import numpy as np

In [108]:
# Nama database dan collection
db_name = 'regresiPossionAnalisisSentimenYTRewind'
collection_name = 'preprocessingData2023'

# Memuat value dari file .env
load_dotenv()

mongodb_url = os.getenv('URL_SANDY')
local_url = os.getenv('URL_LOCAL')

# Membuat koneksi ke MongoDB
client = MongoClient(local_url)
db = client[db_name]
preprocessing_data = db[collection_name]

#### Melakukan Analisis

In [109]:
# Mengambil data dari koleksi MongoDB
data = list(preprocessing_data.find())

# Konversi data menjadi dataframe
df = pd.DataFrame(data)

# Menghitung panjang komentar dalam jumlah kata
df['comment_length'] = df['textOriginal'].apply(lambda x: len(x.split()))

# Menghitung waktu sejak komentar diposting dalam detik
df['publishedAt'] = pd.to_datetime(df['publishedAt']).dt.tz_localize(None)
df['time_since_posted'] = (pd.Timestamp.now() - df['publishedAt']).dt.total_seconds()

# Konversi kolom 'is_reply' menjadi numerik jika belum
df['is_reply'] = df['is_reply'].astype(int)

# Menambahkan intercept ke dalam model
df['intercept'] = 1

# Mendefinisikan variabel independen dan dependen
X = df[['intercept', 'likeCount', 'time_since_posted', 'is_reply']]
y = df['comment_length']

# Membangun model regresi Poisson
poisson_model = sm.GLM(y, X, family=sm.families.Poisson()).fit()

print(poisson_model.summary())

# Prediksi jumlah kata
df['predicted_comment_length'] = poisson_model.predict(X)

# Melihat hasil prediksi vs nilai aktual
print(df[['comment_length', 'predicted_comment_length']].head(10))

                 Generalized Linear Model Regression Results                  
Dep. Variable:         comment_length   No. Observations:                 1000
Model:                            GLM   Df Residuals:                      996
Model Family:                 Poisson   Df Model:                            3
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -5748.9
Date:                Fri, 21 Jun 2024   Deviance:                       6910.4
Time:                        18:04:59   Pearson chi2:                 9.09e+03
No. Iterations:                     6   Pseudo R-squ. (CS):             0.4371
Covariance Type:            nonrobust                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
intercept             4.9156      0.07

In [110]:
df[['comment_length', 'predicted_comment_length']]

Unnamed: 0,comment_length,predicted_comment_length
0,44,46.817109
1,32,19.894571
2,13,19.504321
3,20,19.197910
4,34,19.824561
...,...,...
995,17,18.325848
996,7,18.333819
997,4,18.336510
998,8,9.871552
