In [1]:
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

#import the ML libs
import sklearn
import warnings
from sklearn.cluster import KMeans
from sklearn.metrics import max_error
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')


print("scikit-learn version:", sklearn.__version__)    
print("pandas version:", pd.__version__)            
print("seaborn version:", sns.__version__)  
print("matplotlib version:", matplotlib.__version__) 

scikit-learn version: 1.3.0
pandas version: 1.5.3
seaborn version: 0.12.2
matplotlib version: 3.7.1


In [2]:
df = pd.read_csv("traindata.csv")

print(df.shape)

(20800, 5)


### 📃 Sample the data

Below I am viewing dataset to have an impression of the data. For that I am printing 15 random rows.

In [3]:
df.sample(15)

Unnamed: 0,id,title,author,text,label
6175,6175,Self-Driving Tesla Was Involved in Fatal Crash...,Bill Vlasic and Neal E. Boudette,DETROIT — The race by automakers and techno...,0
5613,5613,"Veteran’s Memorial Desecrated By Protesters, C...",Matthew Bernstein,"\nPosted by Matthew Bernstein | Nov 9, 2016 | ...",1
17937,17937,Trump Makes Last-Minute Push To Appeal To Whit...,,Hillary Clinton Waiting In Wings Of Stage Sinc...,1
4403,4403,,pascalmolineaux,"The way the ""Defense Budget"" in the USA contin...",1
8529,8529,Don’t Call Them ‘Mocktails’ - The New York Times,Rosie Schaap,I’m always thrilled when a certain former drin...,0
10086,10086,London Gets More N.F.L. Games. Too Bad About t...,Victor Mather,"Hey, London! You love American football? Well,...",0
17279,17279,"In 9/11 Document, View of a Saudi Effort to Th...",Mark Mazzetti,WASHINGTON — The document detailing possi...,0
5031,5031,The Verdant Food of Iran Entices at Persian Ne...,Samin Nosrat,Pardis Stitt still remembers the grocery shopp...,0
1203,1203,Donald Trump Says Hillary Clinton Will Start W...,,Donald Trump Says Hillary Clinton Will Start W...,1
5443,5443,Stars at Obama’s White House Farewell Bash Par...,Jerome Hudson,A litany of celebrities braved temperature...,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [5]:
# Checking dataset
df.head(20)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
6,6,Life: Life Of Luxury: Elton John’s 6 Favorite ...,,Ever wonder how Britain’s most iconic pop pian...,1
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
8,8,Excerpts From a Draft Script for Donald Trump’...,,Donald J. Trump is scheduled to make a highly ...,0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0


In [6]:
# checking labels

df.label.unique()

array([1, 0], dtype=int64)

In [7]:
# Analyse the labels distribution
df.label.value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [11]:
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetModel

# Choose the variant of XLNet you want to use
model_name = "xlnet-base-cased"  # For example, "xlnet-base-cased" or "xlnet-large-cased"

# Load XLNet tokenizer and model
tokenizer = XLNetTokenizer.from_pretrained(model_name)
graph = tf.compat.v1.get_default_graph()
with graph.as_default():
    model = TFXLNetModel.from_pretrained(model_name)







RuntimeError: x: Attempting to capture an EagerTensor without building a function.

In [12]:

max_len  = 64

full_input_ids = []
full_input_masks = []
full_segment_ids = []

SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

UNK_ID = tokenizer.encode("<unk>")[0]
CLS_ID = tokenizer.encode("<cls>")[0]
SEP_ID = tokenizer.encode("<sep>")[0]
MASK_ID = tokenizer.encode("<mask>")[0]
EOD_ID = tokenizer.encode("<eod>")[0]

for i,sentence in enumerate(sentences):
    # Tokenize sentence to token id list
    tokens_a = tokenizer.encode(sentence)
    
    # Trim the len of text
    if(len(tokens_a)>max_len-2):
        tokens_a = tokens_a[:max_len-2]
        
        
    tokens = []
    segment_ids = []
    
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(SEG_ID_A)
        
    # Add <sep> token 
    tokens.append(SEP_ID)
    segment_ids.append(SEG_ID_A)
    
    
    # Add <cls> token
    tokens.append(CLS_ID)
    segment_ids.append(SEG_ID_CLS)
    
    input_ids = tokens
    
    # The mask has 0 for real tokens and 1 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [0] * len(input_ids)

    # Zero-pad up to the sequence length at fornt
    if len(input_ids) < max_len:
        delta_len = max_len - len(input_ids)
        input_ids = [0] * delta_len + input_ids
        input_mask = [1] * delta_len + input_mask
        segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

    assert len(input_ids) == max_len
    assert len(input_mask) == max_len
    assert len(segment_ids) == max_len
    
    full_input_ids.append(input_ids)
    full_input_masks.append(input_mask)
    full_segment_ids.append(segment_ids)
    
    if 3 > i:
        print("No.:%d"%(i))
        print("sentence: %s"%(sentence))
        print("input_ids:%s"%(input_ids))
        print("attention_masks:%s"%(input_mask))
        print("segment_ids:%s"%(segment_ids))
        print("\n")

NameError: name 'sentences' is not defined

In [13]:
pip install transformers


Note: you may need to restart the kernel to use updated packages.


In [14]:
import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification

# Step 2: Download Pretrained XLNet Model
model_name = "xlnet-base-cased"
tokenizer = XLNetTokenizer.from_pretrained(model_name)
model = XLNetForSequenceClassification.from_pretrained(model_name)

# Step 4: Tokenization
input_text = "Your input text here."
tokenized_input = tokenizer(input_text, return_tensors="pt")

# Step 5: Model Inference
with torch.no_grad():
    outputs = model(**tokenized_input)

# Step 6: Post-processing
predictions = torch.softmax(outputs.logits, dim=1)

# Output probabilities for each class
print(predictions)


ModuleNotFoundError: No module named 'torch'