In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re

# Data Preprocessing and Cleaning
# Load the labelled articles dataset using pandas (the file is decoded as cp1252)
train_data = pd.read_csv("./drive/MyDrive/Colab Notebooks/Dataset/articles.csv", encoding='cp1252')


# Keep only the article text ("Full_Article") and its label ("Article_Type")
train_data = train_data[["Full_Article", "Article_Type"]]

def preprocess_text(text):
    """Strip markup, URLs and punctuation from a single article.

    Args:
        text: Raw article text. Anything that is not a string (``None``,
            ``NaN`` from an empty CSV cell, ...) is treated as an empty
            article.

    Returns:
        str: ``text`` with HTML tags, ``http...`` links and every character
        other than ASCII letters, digits and whitespace removed; ``''`` when
        ``text`` is not a string.
    """
    # pandas represents empty cells as float NaN; re.sub raises TypeError on
    # non-strings, so map missing values to an empty document instead.
    if not isinstance(text, str):
        return ''

    # Remove HTML tags and links (URLs) using regular expressions
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    # Remove special characters and punctuations, keeping only text and numbers
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text



train_data['Full_Article'] = train_data['Full_Article'].apply(preprocess_text)

train_data


Unnamed: 0,Full_Article,Article_Type
0,The helicopter that crashed in Southeast Alask...,Commercial
1,A year after teasing the fledgling electric ve...,Commercial
2,Bell released the fullsize design of the verti...,Commercial
3,Bell est une socieacuteteacute ameacutericaine...,Commercial
4,It was still anecdotal to observe the explosio...,Commercial
...,...,...
4300,Long rumored tensions came to a head as the CH...,Military
4301,In the course of upcoming investments in new t...,Commercial
4302,At Bell Flight in Fort Worth engineers are wor...,Military
4303,Getting to this price point took about five ye...,Commercial


In [2]:
pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
Colle

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch


# Initialize SentenceBERT (SBERT) tokenizer and model
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"  # You can choose a different SBERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Define a function to convert text into numerical vectors
def encode_text(text):
    """Embed ``text`` with the notebook-level ``tokenizer`` and ``model``.

    The text is tokenized (padded and truncated, returned as PyTorch tensors),
    run through the model with gradient tracking disabled, and the model's
    ``last_hidden_state`` is averaged over dimension 1 (presumably the token
    axis - TODO confirm).

    Returns:
        The averaged hidden-state tensor.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    return pooled

# Apply the text representation to your dataset
train_data['Full_Article_Embeddings'] = train_data['Full_Article'].apply(encode_text)

# Now, data['Full_Article_Embeddings'] contains the numerical vectors for each article with cleaned text.

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [4]:
train_data['Full_Article_Embeddings']

0       [[tensor(-0.0055), tensor(0.1066), tensor(-0.2...
1       [[tensor(-0.1871), tensor(0.0416), tensor(-0.2...
2       [[tensor(0.1835), tensor(0.1684), tensor(-0.20...
3       [[tensor(-0.0033), tensor(-0.0525), tensor(-0....
4       [[tensor(-0.0326), tensor(-0.0071), tensor(-0....
                              ...                        
4300    [[tensor(-0.4331), tensor(0.0852), tensor(0.02...
4301    [[tensor(-0.0427), tensor(-0.0588), tensor(-0....
4302    [[tensor(-0.1681), tensor(-0.1213), tensor(-0....
4303    [[tensor(-0.0504), tensor(0.0095), tensor(-0.0...
4304    [[tensor(-0.3176), tensor(0.0891), tensor(-0.0...
Name: Full_Article_Embeddings, Length: 4305, dtype: object

In [28]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Step 1: Prepare the data
# Each cell of the column holds the tensor returned by encode_text; vstack stacks them row-wise.
X = np.vstack(train_data['Full_Article_Embeddings'].to_numpy())  # Convert embeddings to a 2D NumPy array
y = train_data['Article_Type']  # Labels are used exactly as stored in the CSV (the preview shows strings such as "Commercial" / "Military")


In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from joblib import dump, load
from sklearn.model_selection import cross_val_score

# Step 2: Tune the regularisation strength C of a logistic regression
# with a 5-fold, accuracy-scored grid search over all of X, y.
param_grid = {'C': [0.1, 1, 10]}
classifier = LogisticRegression(max_iter=1000)
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X, y)

print("Best hyperparameters:", grid_search.best_params_)

# Step 3: Evaluate Model Performance with Cross-Validation
# NOTE(review): this re-scores the selected estimator on the same X, y that
# were used to choose C, so the figure is not an independent (held-out)
# estimate and is likely optimistic.
best_classifier = grid_search.best_estimator_
results = cross_val_score(best_classifier, X, y, cv=5, scoring='accuracy')
print("Cross-validation accuracy:", results.mean())

# Step 4: Save the Best Model to Disk
# best_estimator_ is the estimator GridSearchCV refit with the winning C
# (default refit=True), i.e. the model that is persisted here.
dump(best_classifier, './drive/MyDrive/Colab Notebooks/logistic_regression_model.joblib')

Best hyperparameters: {'C': 0.1}
Cross-validation accuracy: 0.8682926829268294


['./drive/MyDrive/Colab Notebooks/logistic_regression_model.joblib']

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load the CSV file containing the links
csv_file = './drive/MyDrive/Colab Notebooks/Dataset/unknown_articles.csv'  # Replace with the actual file path
data = pd.read_csv(csv_file)

# Function to extract text from a given URL
def extract_text_from_url(url, timeout=30):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the article to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host, which would stall the whole ``apply`` over the URL column.

    Returns:
        str | None: The paragraph texts, each followed by a newline, or
        ``None`` if the page could not be fetched or parsed (the error is
        printed instead of raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the text content from the webpage (modify as needed)
        return ''.join(
            paragraph.get_text() + '\n' for paragraph in soup.find_all('p')
        )

    except Exception as e:
        # Best-effort scrape: report the failure and carry on with the next URL.
        print(f"Error extracting text from {url}: {e}")
        return None

# Create a new column in the DataFrame to store the extracted text
data['Extracted_Text'] = data['Article.URL'].apply(extract_text_from_url)

# Save the DataFrame with extracted text to a new CSV file
output_csv = './drive/MyDrive/Colab Notebooks/Dataset/Extracted_Data/unknown_articles_with_text.csv'
data.to_csv(output_csv, index=False)


Error extracting text from https://attain.news/community/special-sea-king-flypast-at-the-royal-junior-school: 404 Client Error: Not Found for url: https://attain.guide/news/special-sea-king-flypast-at-the-royal-junior-school/
Error extracting text from https://www.atlasinfo.fr/Marrakech-Air-Show-2018-200-exposants-et-100-delegations-etrangeres-attendus-a-la-6eme-edition_a94566.html: 404 Client Error: Not Found for url: https://atlasinfo.fr/Marrakech-Air-Show-2018-200-exposants-et-100-delegations-etrangeres-attendus-a-la-6eme-edition_a94566.html
Error extracting text from http://evtol.news/2018/11/05/bell-furthers-multiple-evtol-efforts/: 500 Server Error: Internal Server Error for url: https://evtol.news/2018/11/05/bell-furthers-multiple-evtol-efforts/
Error extracting text from https://www.newsoneplace.com/4085271809/army-aviation-approaches-biggest-decision-years-buy-tiltrotors-: HTTPSConnectionPool(host='www.newsoneplace.com', port=443): Max retries exceeded with url: /4085271809/ar

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re

# Data Preprocessing and Cleaning
# Load the scraped articles. The file was written by DataFrame.to_csv with its
# default encoding (UTF-8), so it has to be decoded as UTF-8 as well; reading
# it as cp1252 garbles non-ASCII characters and can raise UnicodeDecodeError.
test_data = pd.read_csv("./drive/MyDrive/Colab Notebooks/Dataset/Extracted_Data/unknown_articles_with_text.csv", encoding='utf-8')


# Keep only the "Extracted_Text" column; copy it so the column assignment
# further down modifies an independent frame (no SettingWithCopyWarning).
test_data = test_data[["Extracted_Text"]].copy()

def preprocess_text(text):
    """Strip markup, URLs and punctuation from a single article.

    Args:
        text: Raw article text. Anything that is not a string is treated as
            an empty article; this covers rows whose page could not be
            scraped, which come back from the CSV as ``NaN``.

    Returns:
        str: ``text`` with HTML tags, ``http...`` links and every character
        other than ASCII letters, digits and whitespace removed; ``''`` when
        ``text`` is not a string.
    """
    # Failed downloads are stored as empty cells and read back as float NaN;
    # re.sub raises TypeError on non-strings, so map them to an empty document.
    if not isinstance(text, str):
        return ''

    # Remove HTML tags and links (URLs) using regular expressions
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    # Remove special characters and punctuations, keeping only text and numbers
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text



test_data['Extracted_Text'] = test_data['Extracted_Text'].apply(preprocess_text)

test_data


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch


# Initialize SentenceBERT (SBERT) tokenizer and model
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"  # You can choose a different SBERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Define a function to convert text into numerical vectors
def encode_text(text):
    """Embed ``text`` with the notebook-level ``tokenizer`` and ``model``.

    The text is tokenized (padded and truncated, returned as PyTorch tensors),
    run through the model with gradient tracking disabled, and the model's
    ``last_hidden_state`` is averaged over dimension 1 (presumably the token
    axis - TODO confirm).

    Returns:
        The averaged hidden-state tensor.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    return pooled

# Apply the text representation to your dataset
test_data['Full_Article_Embeddings'] = test_data['Extracted_Text'].apply(encode_text)

In [None]:
import numpy as np

# Step 1: Prepare the data
# Each cell of the column holds the tensor returned by encode_text; vstack stacks them row-wise.
X_test= np.vstack(test_data['Full_Article_Embeddings'].to_numpy())  # Convert embeddings to a 2D NumPy array

In [None]:
import joblib

# Load the classifier from the same path the training cell dumped it to.
# NOTE: joblib files are pickle-based; only load files you produced yourself.
loaded_classifier = joblib.load('./drive/MyDrive/Colab Notebooks/logistic_regression_model.joblib')

# Predict a label for every row of X_test
predictions = loaded_classifier.predict(X_test)
