In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from urllib.parse import urlparse

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
domain_index = pd.read_csv('../02_data/data_archive/raphaela/lasser/domain_pc1.csv')
chat_index = np.load('../02_data/telegram/telegram/chat_index.npy')
urls_sample = pd.read_csv('../02_data/url_sample_with_titles.csv')

In [None]:
urls = pd.read_csv('../02_data/urls_with_titles.csv')

In [3]:
print("Chat Index Shape:", chat_index.shape)
print(urls_sample.head())

Domain Index Shape: (7277,)
Chat Index Shape: (27592,)
         id                                                url  \
0  31404053  https://www.indiatoday.in/india/story/wrestler...   
1  55066318  https://www.nytimes.com/2019/08/15/sports/base...   
2  52148717  https://bigota.d.miui.com/V13.0.3.0.SKGINXM/mi...   
3   2233418                                 http://ptv.io/2Jrx   
4  17089736                       https://youtu.be/j5fCqKbSC7M   

                   start_date                    end_date  
0  2023-05-04 09:33:21.000000  2023-05-04 09:33:21.000000  
1  2019-08-16 06:19:03.000000  2019-08-16 06:19:03.000000  
2  2022-09-09 03:09:19.000000  2022-09-09 03:09:19.000000  
3  2023-06-08 00:12:11.574968  2023-06-17 01:11:31.037103  
4  2021-07-18 00:56:24.000000  2023-07-25 16:10:55.579514  


In [None]:
# Plot the distribution of the 'pc1' column
plt.figure(figsize=(10, 6))
plt.hist(domain_index['pc1'], bins=30, color='blue', alpha=0.7)
plt.title('Distribution of pc1')
plt.xlabel('pc1')
plt.ylabel('Frequency')
plt.grid(axis='y')
plt.show()

# Show the row with the highest 'pc1' score
highest_scores = domain_index.nlargest(5, 'pc1')
print("Top 5 rows with the highest pc1 scores:")
print(highest_scores)

# Show the 5 rows with the lowest 'pc1' scores
lowest_scores = domain_index.nsmallest(5, 'pc1')
print("\nBottom 5 rows with the lowest pc1 scores:")
print(lowest_scores)


In [4]:
# Extract domain from URL 
def extract_domain(url):
    try:
        parsed_url = urlparse(url)
        return parsed_url.netloc
    except Exception as e:
        print(f"Error parsing URL: {url} - {e}")
        return None

# Apply the function to extract the domain and handle any errors
urls_sample['domain'] = urls_sample['url'].apply(extract_domain)

# Display the first few rows with the extracted domain
print("URLs Sample with Domain:\n", urls_sample.head())

Error parsing URL: http://www.successors.ca](http://www.successors.ca/): - Invalid IPv6 URL
Error parsing URL: http://[host_port] - 'host_port' does not appear to be an IPv4 or IPv6 address
URLs Sample with Domain:
          id                                                url  \
0  31404053  https://www.indiatoday.in/india/story/wrestler...   
1  55066318  https://www.nytimes.com/2019/08/15/sports/base...   
2  52148717  https://bigota.d.miui.com/V13.0.3.0.SKGINXM/mi...   
3   2233418                                 http://ptv.io/2Jrx   
4  17089736                       https://youtu.be/j5fCqKbSC7M   

                   start_date                    end_date             domain  
0  2023-05-04 09:33:21.000000  2023-05-04 09:33:21.000000  www.indiatoday.in  
1  2019-08-16 06:19:03.000000  2019-08-16 06:19:03.000000    www.nytimes.com  
2  2022-09-09 03:09:19.000000  2022-09-09 03:09:19.000000  bigota.d.miui.com  
3  2023-06-08 00:12:11.574968  2023-06-17 01:11:31.037103             p

In [5]:
# Step 2: Match domains with domain_index
urls_sample = pd.merge(urls_sample, domain_index, on='domain', how='inner')

# Display the first few rows of the merged DataFrame
print(urls_sample.head())

# Drop rows where the domain was not found in domain_index
urls_sample.dropna(subset=['domain_idx'], inplace=True)

In [6]:
# Assuming chat_index corresponds to rows in urls_sample, add it to the DataFrame - this is RANDOM
urls_sample['chat_idx'] = chat_index[:len(urls_sample)]

In [7]:
urls_sample['start_date'] = pd.to_datetime(urls_sample['start_date'])

urls_sample['start_year'] = urls_sample['start_date'].dt.year
urls_sample['start_month'] = urls_sample['start_date'].dt.month
urls_sample['start_day'] = urls_sample['start_date'].dt.day
urls_sample['start_hour'] = urls_sample['start_date'].dt.hour

In [11]:
urls_sample

Unnamed: 0,id,url,start_date,end_date,domain,domain_idx,chat_idx,start_year,start_month,start_day,start_hour
10,60580487,https://ift.tt/2Hfa9Dy,2018-04-20 21:03:26.000000,2018-04-20 21:03:26.000000,ift.tt,174.0,65541,2018,4,20,21
14,50013707,https://telegra.ph/file/6b3718f03ef7a58cf6e9e.jpg,2019-04-28 22:00:26.000000,2019-04-28 22:00:26.000000,telegra.ph,811.0,196615,2019,4,28,22
24,46729961,https://ift.tt/OiQ4GwH,2022-07-06 13:13:06.000000,2022-07-06 13:13:06.000000,ift.tt,174.0,196616,2022,7,6,13
31,8296784,https://ift.tt/3kPM3Vy,2021-11-18 23:43:13.000000,2021-11-18 23:43:13.000000,ift.tt,174.0,65544,2021,11,18,23
34,39881019,https://telegra.ph/file/b0fc8e4a669b4adeb01f9.png,2021-03-18 11:10:03.000000,2021-03-18 11:10:03.000000,telegra.ph,811.0,196617,2021,3,18,11
...,...,...,...,...,...,...,...,...,...,...,...
99951,59092647,https://gplinks.co/A4EGqk,2020-09-03 20:38:18.000000,2020-09-03 20:38:18.000000,gplinks.co,6982.0,34318,2020,9,3,20
99955,1911447,https://nationalfile.com/pennsylvania-gop-esta...,2022-09-25 21:18:59.000000,2023-07-25 12:30:15.280887,nationalfile.com,332.0,34319,2022,9,25,21
99984,27281377,https://ift.tt/odaHKDW,2022-04-04 15:21:39.000000,2022-04-04 15:21:39.000000,ift.tt,174.0,34322,2022,4,4,15
99987,3863500,https://dcenquirer.com/house-republicans-kevin...,2023-06-08 06:32:54.321559,2023-06-08 06:32:54.321559,dcenquirer.com,1666.0,34325,2023,6,8,6


In [None]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained model for generating embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')  # This is a lightweight and efficient model

# Convert the title column to embeddings
urls_sample['title_embedding'] = urls_sample['title'].apply(lambda x: model.encode(str(x)))

In [9]:
# Step 3: Combine Features
# Use the domain index and chat index as features
# Extract the features for the MLP (concatenate start_year and title embedding)
X = urls_sample.apply(lambda row: [row['start_year']] + list(row['title_embedding']), axis=1).tolist()

# Convert the list of lists into a numpy array for training the MLP
X = np.array(X)

# Display the shape of the input features
print(X.shape)

#X = urls_sample[['start_year', 'chat_idx']].values
y = urls_sample['pc1'].values  

In [10]:
# Step 4: Train an MLP Model
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the MLP regressor
mlp = MLPRegressor(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', max_iter=500, random_state=42)
mlp.fit(X_train, y_train)

# Make predictions
y_pred = mlp.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Display the first few predictions
print("Predictions vs Actual:\n", pd.DataFrame({'Predicted': y_pred[:5], 'Actual': y_test[:5]}))

from sklearn.metrics import r2_score

# Calculate R² for the model
r2 = r2_score(y_test, y_pred)
print(f'R² Score: {r2}')

Mean Squared Error: 3295279.7737017632
Predictions vs Actual:
      Predicted  Actual
0  1457.877087   280.0
1  1434.203992   174.0
2  1533.773000  1248.0
3  1443.256572  1274.0
4  1683.800110   174.0
