In [2]:
import pandas as pd
df = pd.read_csv('news-article-categories.csv')

In [3]:
df.drop_duplicates(inplace=True)

In [4]:
df.head()

Unnamed: 0,category,title,body
0,ARTS & CULTURE,Modeling Agencies Enabled Sexual Predators For...,"In October 2017, Carolyn Kramer received a dis..."
1,ARTS & CULTURE,Actor Jeff Hiller Talks “Bright Colors And Bol...,This week I talked with actor Jeff Hiller abou...
2,ARTS & CULTURE,New Yorker Cover Puts Trump 'In The Hole' Afte...,The New Yorker is taking on President Donald T...
3,ARTS & CULTURE,Man Surprises Girlfriend By Drawing Them In Di...,"Kellen Hickey, a 26-year-old who lives in Huds..."
4,ARTS & CULTURE,This Artist Gives Renaissance-Style Sculptures...,There’s something about combining the traditio...


In [5]:
df['category'].unique()

array(['ARTS & CULTURE', 'BUSINESS', 'COMEDY', 'CRIME', 'EDUCATION',
       'ENTERTAINMENT', 'ENVIRONMENT', 'MEDIA', 'POLITICS', 'RELIGION',
       'SCIENCE', 'SPORTS', 'TECH', 'WOMEN'], dtype=object)

In [6]:
df.dropna(inplace=True)


In [8]:
df.drop("title",axis='columns')

Unnamed: 0,category,body
0,ARTS & CULTURE,"In October 2017, Carolyn Kramer received a dis..."
1,ARTS & CULTURE,This week I talked with actor Jeff Hiller abou...
2,ARTS & CULTURE,The New Yorker is taking on President Donald T...
3,ARTS & CULTURE,"Kellen Hickey, a 26-year-old who lives in Huds..."
4,ARTS & CULTURE,There’s something about combining the traditio...
...,...,...
6872,WOMEN,I still think about that Tuesday night dinner ...
6873,WOMEN,I remember the morning of my high school gradu...
6874,WOMEN,"My husband, Gene, doesn't wear pajamas. I aske..."
6875,WOMEN,"\nBy AntonioGuillem, via ThinkStock\nBy Lisa ..."


In [9]:
import re

def clean(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text)  # remove extra spaces
    return text.strip()

df['clean_body'] = df['body'].apply(clean)

In [10]:
df.head()

Unnamed: 0,category,title,body,clean_body
0,ARTS & CULTURE,Modeling Agencies Enabled Sexual Predators For...,"In October 2017, Carolyn Kramer received a dis...",in october 2017 carolyn kramer received a dist...
1,ARTS & CULTURE,Actor Jeff Hiller Talks “Bright Colors And Bol...,This week I talked with actor Jeff Hiller abou...,this week i talked with actor jeff hiller abou...
2,ARTS & CULTURE,New Yorker Cover Puts Trump 'In The Hole' Afte...,The New Yorker is taking on President Donald T...,the new yorker is taking on president donald t...
3,ARTS & CULTURE,Man Surprises Girlfriend By Drawing Them In Di...,"Kellen Hickey, a 26-year-old who lives in Huds...",kellen hickey a 26yearold who lives in hudson ...
4,ARTS & CULTURE,This Artist Gives Renaissance-Style Sculptures...,There’s something about combining the traditio...,theres something about combining the tradition...


In [11]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join(word for word in text.split() if word not in stop_words)

df['clean_body'] = df['clean_body'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\praga\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [12]:
df.head()

Unnamed: 0,category,title,body,clean_body
0,ARTS & CULTURE,Modeling Agencies Enabled Sexual Predators For...,"In October 2017, Carolyn Kramer received a dis...",october 2017 carolyn kramer received disturbin...
1,ARTS & CULTURE,Actor Jeff Hiller Talks “Bright Colors And Bol...,This week I talked with actor Jeff Hiller abou...,week talked actor jeff hiller hit broadway pla...
2,ARTS & CULTURE,New Yorker Cover Puts Trump 'In The Hole' Afte...,The New Yorker is taking on President Donald T...,new yorker taking president donald trump asked...
3,ARTS & CULTURE,Man Surprises Girlfriend By Drawing Them In Di...,"Kellen Hickey, a 26-year-old who lives in Huds...",kellen hickey 26yearold lives hudson wisconsin...
4,ARTS & CULTURE,This Artist Gives Renaissance-Style Sculptures...,There’s something about combining the traditio...,theres something combining traditional uptight...


In [13]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

df['clean_body'] = df['clean_body'].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\praga\AppData\Roaming\nltk_data...


In [14]:
final_df = df[['category','clean_body']]

In [15]:
final_df.head()

Unnamed: 0,category,clean_body
0,ARTS & CULTURE,october 2017 carolyn kramer received disturbin...
1,ARTS & CULTURE,week talked actor jeff hiller hit broadway pla...
2,ARTS & CULTURE,new yorker taking president donald trump asked...
3,ARTS & CULTURE,kellen hickey 26yearold life hudson wisconsin ...
4,ARTS & CULTURE,there something combining traditional uptight ...


In [16]:
final_df.to_csv("news.csv")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [18]:
vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1,2))  # unigrams + bigrams
X = vectorizer.fit_transform(final_df['clean_body'])

In [19]:
from sklearn.model_selection import train_test_split

y = final_df['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Define model
model = Sequential()
model.add(Dense(512, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Binary output

# Compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train with multiple epochs
history = model.fit(X_train.toarray(), y_train, epochs=5, batch_size=32,
                    validation_data=(X_test.toarray(), y_test),
                    callbacks=[EarlyStopping(patience=3, restore_best_weights=True)])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: Invalid dtype: object

Accuracy: 0.799853907962016
