In [2]:
# Step 1: Load the Data
import pandas as pd

# Read the raw dataset from the working directory into a DataFrame
df = pd.read_csv('data.csv')

# Display a sample of the dataset
# (random_state pins the sample so the preview is the same on every run)
print("Sample of the dataset:")
print(df.sample(5, random_state=42))

# Display dataset information
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() -- doing so emits a stray "None" line after the report.
print("\nDataset Information:")
df.info()

# Display summary statistics
print("\nSummary Statistics:")
print(df.describe())

# Step 2: Data Preprocessing
# Remove the columns that the later steps of this cell do not read;
# the result is bound back to `df`.
df = df.drop(
    columns=['ID', 'URL', 'PUBLISHER', 'STORY', 'HOSTNAME', 'TIMESTAMP'],
)

# Step 3: Data Exploration (if needed)
# Add exploratory data analysis code here if required

# Step 4: Feature Engineering (if needed)
# Add feature engineering code here if required

# Step 5 & 6: Split the data, then vectorize the text with TfidfVectorizer
#
# The split happens BEFORE the vectorizer is fitted. Fitting TF-IDF on the
# full corpus would let the test titles influence the vocabulary and the IDF
# weights (data leakage), which makes the test accuracy look better than it
# would be on genuinely unseen titles.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Split the raw titles and their labels into training and testing sets
titles_train, titles_test, y_train, y_test = train_test_split(
    df['TITLE'], df['CATEGORY'], test_size=0.2, random_state=42
)

# Create a TfidfVectorizer with stop words removal and a maximum document frequency of 0.7
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Learn the vocabulary / IDF weights from the training titles only ...
X_train = tfidf_vectorizer.fit_transform(titles_train)
# ... and project the test titles onto that already-fitted vocabulary
X_test = tfidf_vectorizer.transform(titles_test)

# TF-IDF matrix for the whole 'TITLE' column, built with the train-fitted
# vectorizer. Not used for training or evaluation here; kept only so that any
# other cell that still refers to `tfidf_matrix` keeps working.
tfidf_matrix = tfidf_vectorizer.transform(df['TITLE'])

# Step 7: Train the PassiveAggressiveClassifier
from sklearn.linear_model import PassiveAggressiveClassifier

# Create and train a PassiveAggressiveClassifier.
# The estimator shuffles the training data between epochs by default, so
# random_state is fixed (same seed as the train/test split) to make the fitted
# model -- and therefore the reported accuracy -- reproducible across runs.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(X_train, y_train)

# Step 8: Score the trained classifier on the test split
from sklearn.metrics import accuracy_score, confusion_matrix

# Predicted category for every row of X_test
y_pred = pac.predict(X_test)

# Compare predictions against the true test labels
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report accuracy (two decimals) followed by the raw confusion matrix
print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:', conf_matrix, sep='\n')

Sample of the dataset:
            ID                                              TITLE  \
329850  330310                        Gasoline prices move higher   
177017  177353  Eta Aquarid Meteor Shower 2014: May 5-6 Viewin...   
415340  415859  Usher Teams with Nicki Minaj, Performs at MTV ...   
119874  120210  International Aid Providers Join Hands To Stop...   
134275  134611  Chuck D From Public Enemy Talks About Being Th...   

                                                      URL  \
329850  http://www.mohavedailynews.com/news/briefs/art...   
177017  http://www.theepochtimes.com/n3/658218-eta-aqu...   
415340  http://www.thehollywoodgossip.com/2014/08/ushe...   
119874  http://www.newsonwellness.com/2014/04/internat...   
134275  http://www.vh1.com/music/tuner/2014-04-17/chuc...   

                   PUBLISHER CATEGORY                          STORY  \
329850    Mohave Valley News        b  dr2nTauJSiV8XjMgFMRndoKxt2NmM   
177017       The Epoch Times        t  dN-BY7zHZL7F