In [2]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import NuSVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [4]:
# Load train dataset
train_df = pd.read_csv("C:\\Users\\nizar\\NLP_projet_Nizar_el_Mouaquit\\NLP_project\\BBC News Train.csv")
# Load test dataset
test_df = pd.read_csv("C:\\Users\\nizar\\NLP_projet_Nizar_el_Mouaquit\\NLP_project\\BBC News Test.csv")
# Print first rows of the train dataset
print("Train Dataset:")
print(train_df.head())
# Print first rows of the test dataset
print("Test Dataset:")
print(test_df.head())

Train Dataset:
   ArticleId                                               Text  Category
0       1833  worldcom ex-boss launches defence lawyers defe...  business
1        154  german business confidence slides german busin...  business
2       1101  bbc poll indicates economic gloom citizens in ...  business
3       1976  lifestyle  governs mobile choice  faster  bett...      tech
4        917  enron bosses in $168m payout eighteen former e...  business
Test Dataset:
   ArticleId                                               Text
0       1018  qpr keeper day heads for preston queens park r...
1       1319  software watching while you work software that...
2       1138  d arcy injury adds to ireland woe gordon d arc...
3        459  india s reliance family feud heats up the ongo...
4       1020  boro suffer morrison injury blow middlesbrough...


In [6]:

def preprocess_text(text):
    # Remove punctuation marks
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Add specific chosen patterns/sentences
    irrelevant_patterns = [
        r'\birrelevant_word\b',
        r'\bunwanted_sentence\b'
    ]
    for pattern in irrelevant_patterns:
        text = re.sub(pattern, '', text)
    
    # Remove whitespaces
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    
    return text

# Load the train dataset from CSV file
train_df = pd.read_csv("C:\\Users\\nizar\\NLP_projet_Nizar_el_Mouaquit\\NLP_project\\BBC News Train.csv")
print(train_df)
# Apply text preprocessing on the 'text' column
train_df['Text'] = train_df['Text'].apply(preprocess_text)
# Print first rows preprocessed dataset
print("Pre-processed Train Dataset:")
print(train_df)

      ArticleId                                               Text  \
0          1833  worldcom ex-boss launches defence lawyers defe...   
1           154  german business confidence slides german busin...   
2          1101  bbc poll indicates economic gloom citizens in ...   
3          1976  lifestyle  governs mobile choice  faster  bett...   
4           917  enron bosses in $168m payout eighteen former e...   
...         ...                                                ...   
1485        857  double eviction from big brother model caprice...   
1486        325  dj double act revamp chart show dj duo jk and ...   
1487       1590  weak dollar hits reuters revenues at media gro...   
1488       1587  apple ipod family expands market apple has exp...   
1489        538  santy worm makes unwelcome visit thousands of ...   

           Category  
0          business  
1          business  
2          business  
3              tech  
4          business  
...             ...  
1485 

In [7]:
def Transform_case(text):
    # lowercase
    text = text.lower()
    return text

train_df['Text'] = train_df['Text'].apply(Transform_case)
# Print first rows of the preprocessed data
print("Transform case :")
print(train_df)

Transform case :
      ArticleId                                               Text  \
0          1833  worldcom exboss launches defence lawyers defen...   
1           154  german business confidence slides german busin...   
2          1101  bbc poll indicates economic gloom citizens in ...   
3          1976  lifestyle governs mobile choice faster better ...   
4           917  enron bosses in m payout eighteen former enron...   
...         ...                                                ...   
1485        857  double eviction from big brother model caprice...   
1486        325  dj double act revamp chart show dj duo jk and ...   
1487       1590  weak dollar hits reuters revenues at media gro...   
1488       1587  apple ipod family expands market apple has exp...   
1489        538  santy worm makes unwelcome visit thousands of ...   

           Category  
0          business  
1          business  
2          business  
3              tech  
4          business  
...       

In [8]:
nltk.download('stopwords')
nltk.download('punkt')
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_text = [word for word in tokens if word.casefold() not in stop_words]
    return ' '.join(filtered_text)

train_df['Text'] = train_df['Text'].apply(remove_stopwords)
print(train_df)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nizar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nizar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


      ArticleId                                               Text  \
0          1833  worldcom exboss launches defence lawyers defen...   
1           154  german business confidence slides german busin...   
2          1101  bbc poll indicates economic gloom citizens maj...   
3          1976  lifestyle governs mobile choice faster better ...   
4           917  enron bosses payout eighteen former enron dire...   
...         ...                                                ...   
1485        857  double eviction big brother model caprice holb...   
1486        325  dj double act revamp chart show dj duo jk joel...   
1487       1590  weak dollar hits reuters revenues media group ...   
1488       1587  apple ipod family expands market apple expande...   
1489        538  santy worm makes unwelcome visit thousands web...   

           Category  
0          business  
1          business  
2          business  
3              tech  
4          business  
...             ...  
1485 

In [9]:
def tokenize_text(text):
    # Remove punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = word_tokenize(text)
    return tokens
# example
train_df['Text'] = train_df['Text'].apply(tokenize_text)
print(train_df)

      ArticleId                                               Text  \
0          1833  [worldcom, exboss, launches, defence, lawyers,...   
1           154  [german, business, confidence, slides, german,...   
2          1101  [bbc, poll, indicates, economic, gloom, citize...   
3          1976  [lifestyle, governs, mobile, choice, faster, b...   
4           917  [enron, bosses, payout, eighteen, former, enro...   
...         ...                                                ...   
1485        857  [double, eviction, big, brother, model, capric...   
1486        325  [dj, double, act, revamp, chart, show, dj, duo...   
1487       1590  [weak, dollar, hits, reuters, revenues, media,...   
1488       1587  [apple, ipod, family, expands, market, apple, ...   
1489        538  [santy, worm, makes, unwelcome, visit, thousan...   

           Category  
0          business  
1          business  
2          business  
3              tech  
4          business  
...             ...  
1485 

In [10]:
#list to strings
train_df['Text'] = train_df['Text'].apply(lambda x: ' '.join(x))
# Extract preprocessed text
text_data = train_df['Text'].values
# Create an instance of TfidfVectorizer
vectorizer = TfidfVectorizer()
# Fit/transform the text data into TF-IDF features
tfidf_features = vectorizer.fit_transform(text_data)
# Print the shape of the TF-IDF features
print("Shape of TF-IDF features:", tfidf_features.shape)
# Print the feature names(vocabulary)
print("Feature names (vocabulary):")
print(vectorizer.get_feature_names())

Shape of TF-IDF features: (1490, 25213)
Feature names (vocabulary):


AttributeError: 'TfidfVectorizer' object has no attribute 'get_feature_names'

In [11]:
# Extraction reprocessed text and corresponding labels
text_data = train_df['Text'].values
labels = train_df['Category'].values
# Data Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, labels, test_size=0.2, random_state=42)
# Create an instance of NuSVC classifier
svm_classifier = NuSVC(kernel='rbf')
# Train the SVM classifier
svm_classifier.fit(X_train, y_train)
# Predict the labels for test data
y_pred = svm_classifier.predict(X_test)
# Evaluate the performance of the classifier
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
text_data = train_df['Text'].values
labels = train_df['Category'].values
# instance of TfidfVectorizer
vectorizer = TfidfVectorizer()
# Fit/transform the text data into TF-IDF features
tfidf_features = vectorizer.fit_transform(text_data)
# Split data to training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, labels, test_size=0.2, random_state=42)
# Create instance of NuSVC classifier
svm_classifier = NuSVC(kernel='rbf')
# Train SVM classifier
svm_classifier.fit(X_train, y_train)
# Predict labels test data
y_pred = svm_classifier.predict(X_test)
# Evaluate the performance of the classifier
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

     business       0.90      0.99      0.94        75
entertainment       0.96      0.98      0.97        46
     politics       0.98      0.93      0.95        56
        sport       0.98      1.00      0.99        63
         tech       1.00      0.90      0.95        58

     accuracy                           0.96       298
    macro avg       0.97      0.96      0.96       298
 weighted avg       0.96      0.96      0.96       298

Accuracy: 0.959731543624161
Classification Report:
               precision    recall  f1-score   support

     business       0.90      0.99      0.94        75
entertainment       0.96      0.98      0.97        46
     politics       0.98      0.93      0.95        56
        sport       0.98      1.00      0.99        63
         tech       1.00      0.90      0.95        58

     accuracy                           0.96       298
    macro avg       0.97      0.96      0

In [12]:
# Extract preprocessed text from test DataFrame
test_data = test_df['Text'].apply(preprocess_text)
test_data = test_df['Text'].apply(Transform_case)
test_data = test_df['Text'].apply(remove_stopwords)
test_data = test_df['Text'].apply(tokenize_text)
# Join the tokens into strings for each document in the test DataFrame
test_data = test_data.apply(lambda tokens: ' '.join(tokens))
# Transform the test data into TF-IDF features using the same vectorizer
test_tfidf_features = vectorizer.transform(test_data.values)
# Predict labels/test data
test_pred = svm_classifier.predict(test_tfidf_features)
# Print test data/predictions
print("Test Data:")
print(test_data)
print("Test Predictions:")
print(test_pred)

Test Data:
0      qpr keeper day heads for preston queens park r...
1      software watching while you work software that...
2      d arcy injury adds to ireland woe gordon d arc...
3      india s reliance family feud heats up the ongo...
4      boro suffer morrison injury blow middlesbrough...
                             ...                        
730    eu to probe alitalia state aid the european co...
731    u2 to play at grammy awards show irish rock ba...
732    sport betting rules in spotlight a group of mp...
733    alfa romeos to get gm engines fiat is to stop ...
734    citizenship event for 18s touted citizenship c...
Name: Text, Length: 735, dtype: object
Test Predictions:
['sport' 'tech' 'sport' 'business' 'sport' 'sport' 'politics' 'politics'
 'entertainment' 'business' 'business' 'tech' 'politics' 'tech'
 'entertainment' 'sport' 'politics' 'tech' 'entertainment' 'entertainment'
 'business' 'politics' 'sport' 'business' 'politics' 'sport' 'business'
 'sport' 'sport' 'bus