In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv(r'D:\Study\Project\NLP\CNN_Articels_clean\CNN_Articels_clean.csv')

In [3]:
df.head()

Unnamed: 0,Index,Author,Date published,Category,Section,Url,Headline,Description,Keywords,Second headline,Article text
0,0,"Jacopo Prisco, CNN",2021-07-15 02:46:59,news,world,https://www.cnn.com/2021/07/14/world/tusimple-...,"There's a shortage of truckers, but TuSimple t...",The e-commerce boom has exacerbated a global t...,"world, There's a shortage of truckers, but TuS...","There's a shortage of truckers, but TuSimple t...","(CNN)Right now, there's a shortage of truck d..."
1,2,"Stephanie Bailey, CNN",2021-05-12 07:52:09,news,world,https://www.cnn.com/2021/05/12/world/ironhand-...,Bioservo's robotic 'Ironhand' could protect fa...,Working in a factory can mean doing the same t...,"world, Bioservo's robotic 'Ironhand' could pro...",A robotic 'Ironhand' could protect factory wor...,(CNN)Working in a factory or warehouse can me...
2,3,"Words by Stephanie Bailey, video by Zahra Jamshed",2021-06-16 02:51:30,news,asia,https://www.cnn.com/2021/06/15/asia/swarm-robo...,This swarm of robots gets smarter the more it ...,"In a Hong Kong warehouse, a swarm of autonomou...","asia, This swarm of robots gets smarter the mo...",This swarm of robots gets smarter the more it ...,"(CNN)In a Hong Kong warehouse, a swarm of aut..."
3,4,"Paul R. La Monica, CNN Business",2022-03-15 09:57:36,business,investing,https://www.cnn.com/2022/03/15/investing/brics...,Russia is no longer an option for investors. T...,"For many years, the world's most popular emerg...","investing, Russia is no longer an option for i...",Russia is no longer an option for investors. T...,"New York (CNN Business)For many years, the wor..."
4,7,Reuters,2022-03-15 11:27:02,business,business,https://www.cnn.com/2022/03/15/business/russia...,Russian energy investment ban part of new EU s...,The European Union formally approved on Tuesda...,"business, Russian energy investment ban part o...",EU bans investment in Russian energy in new sa...,The European Union formally approved on Tuesda...


In [4]:
df.columns

Index(['Index', 'Author', 'Date published', 'Category', 'Section', 'Url',
       'Headline', 'Description', 'Keywords', 'Second headline',
       'Article text'],
      dtype='object')

In [5]:
df.shape

(4076, 11)

In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    
    return tokens

# Apply the preprocessing function to the 'text_column' in the DataFrame
df['preprocessed_article_text'] = df['Article text'].apply(preprocess_text)

# Example usage:
print(df['preprocessed_article_text'])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nishi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nishi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nishi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0       [, cnn, , right, , shortage, truck, driver, u,...
1       [, cnn, , working, factory, warehouse, mean, t...
2       [, cnn, , hong, kong, warehouse, , swarm, auto...
3       [new, york, , cnn, business, , many, year, , w...
4       [european, union, formally, approved, tuesday,...
                              ...                        
4071    [, reuters, , australian, open, covid19, vacci...
4072    [, cnn, , four, golfer, scheduled, play, south...
4073    [, cnn, , longtime, international, olympic, co...
4074    [norwegian, technology, company, found, way, s...
4075    [story, excerpted, november, 23, edition, cnn,...
Name: preprocessed_article_text, Length: 4076, dtype: object


In [7]:
df['preprocessed_article_text']

0       [, cnn, , right, , shortage, truck, driver, u,...
1       [, cnn, , working, factory, warehouse, mean, t...
2       [, cnn, , hong, kong, warehouse, , swarm, auto...
3       [new, york, , cnn, business, , many, year, , w...
4       [european, union, formally, approved, tuesday,...
                              ...                        
4071    [, reuters, , australian, open, covid19, vacci...
4072    [, cnn, , four, golfer, scheduled, play, south...
4073    [, cnn, , longtime, international, olympic, co...
4074    [norwegian, technology, company, found, way, s...
4075    [story, excerpted, november, 23, edition, cnn,...
Name: preprocessed_article_text, Length: 4076, dtype: object

In [8]:
from gensim import corpora, models

def train_lda_model(documents, num_topics=30):
    # Create dictionary and corpus
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    
    # Train LDA model
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)
    
    return lda_model, dictionary

# Example usage:
# Assuming 'df' is your DataFrame and 'preprocessed_text' is the column containing preprocessed documents
documents = df['preprocessed_article_text'].tolist()
lda_model, dictionary = train_lda_model(documents)

In [9]:
def infer_topics(lda_model, dictionary, text):
    preprocessed_text = preprocess_text(text)
    bow_vector = dictionary.doc2bow(preprocessed_text)
    topics = lda_model.get_document_topics(bow_vector)
    
    return topics

# # Example usage:
# text = "China has announced plans to launch a new space mission to explore Mars. The mission, named Tianwen-1, will include an orbiter, a lander, and a rover. This marks China's latest step in its ambitious space exploration program. The mission aims to search for signs of past life and assess the planet's geology."
# topics = infer_topics(lda_model, dictionary, text)
# print(topics)


In [12]:
df['Article text'][0]

' (CNN)Right now, there\'s a shortage of truck drivers in the US and worldwide, exacerbated by the e-commerce boom brought on by the pandemic. One solution to the problem is autonomous trucks, and several companies are in a race to be the first to launch one. Among them is San Diego-based TuSimple.Founded in 2015, TuSimple has completed about 2 million miles of road tests with its 70 prototype trucks across the US, China and Europe. Although these are simply commercially available trucks retrofitted with its technology, TuSimple has deals in place with two of the world\'s largest truck manufacturers -- Navistar in the US and Traton, Volkswagen\'s trucking business, in Europe -- to design and build fully autonomous models, which it hopes to launch by 2024. Photos: The Yara Birkeland is what its builders call the world\'s first zero-emission, autonomous cargo ship. The ship is scheduled to make its first journey between two Norwegian towns before the end of the year. Click through to see

In [22]:
new_df = df

In [26]:
new_df.columns

Index(['Index', 'Author', 'Date published', 'Category', 'Section', 'Url',
       'Headline', 'Description', 'Keywords', 'Second headline',
       'Article text', 'preprocessed_article_text'],
      dtype='object')

In [31]:
new_df = new_df.drop(columns=['Index', 'Author', 'Date published', 'Category', 'Section', 'Url',
       'Headline', 'Description', 'Keywords', 'Second headline',
       'preprocessed_article_text'])

In [29]:
new_df.drop(df.index[3:], inplace=True)


In [32]:
new_df.head()

Unnamed: 0,Article text
0,"(CNN)Right now, there's a shortage of truck d..."
1,(CNN)Working in a factory or warehouse can me...
2,"(CNN)In a Hong Kong warehouse, a swarm of aut..."


In [36]:
new_df['Article text'][2]

' (CNN)In a Hong Kong warehouse, a swarm of autonomous robots works 24/7. They\'re not just working hard, they\'re working smart; as they operate, they get better at their job.The Autonomous Mobile Robots were developed by Chinese startup Geek+. As they move around the warehouse they\'re guided by QR codes on the floor, and using AI they are able to make their own decisions, including what direction to travel and what route to take to their destination.  Photos: The robots running our warehousesRobots are an increasingly familiar presence in warehouses. At the south-east London warehouse run by British online supermarket Ocado, 3,000 robots fulfill shopping orders. When an order is sent to the warehouse, the bots spring to life and head towards the container they require. Scroll through to see more robots that are revolutionizing warehouses.Hide Caption 1 of 8 Photos: The robots running our warehousesIn response to the coronavirus pandemic, MIT collaborated with Ava Robotics and the Gr

In [33]:
new_df.to_pickle('demo.pkl')

In [13]:
df['Headline'][0]

"There's a shortage of truckers, but TuSimple thinks it has a solution: no driver needed - CNN"

In [43]:
file_path = "demo2.txt"  # Replace "example.txt" with the path to your text file

# Open the file in read mode
with open(file_path, "r") as file:
    # Read the entire contents of the file
    text = file.read()

In [44]:
print(text)

'(CNN)Right now, there\'s a shortage of truck drivers in the US and worldwide, exacerbated by the e-commerce boom brought on by the pandemic. One solution to the problem is autonomous trucks, and several companies are in a race to be the first to launch one. Among them is San Diego-based TuSimple.Founded in 2015, TuSimple has completed about 2 million miles of road tests with its 70 prototype trucks across the US, China and Europe. Although these are simply commercially available trucks retrofitted with its technology, TuSimple has deals in place with two of the world\'s largest truck manufacturers -- Navistar in the US and Traton, Volkswagen\'s trucking business, in Europe -- to design and build fully autonomous models, which it hopes to launch by 2024. Photos: The Yara Birkeland is what its builders call the world\'s first zero-emission, autonomous cargo ship. The ship is scheduled to make its first journey between two Norwegian towns before the end of the year. Click through to see 

In [48]:
import pickle
pickle_file_path = "text_data2.pickle"  # Replace "text_data.pkl" with the desired name of your pickle file

with open(pickle_file_path, "wb") as pickle_file:
    pickle.dump(text2, pickle_file)

In [46]:
with open('text_data2.pkl', "r") as file:
    # Read the entire contents of the file
    text2 = file.read()

In [47]:
text2

'€\x04•|0\x00\x00\x00\x00\x00\x00Xu0\x00\x00\'(CNN)Right now, there\\\'s a shortage of truck drivers in the US and worldwide, exacerbated by the e-commerce boom brought on by the pandemic. One solution to the problem is autonomous trucks, and several companies are in a race to be the first to launch one. Among them is San Diego-based TuSimple.Founded in 2015, TuSimple has completed about 2 million miles of road tests with its 70 prototype trucks across the US, China and Europe. Although these are simply commercially available trucks retrofitted with its technology, TuSimple has deals in place with two of the world\\\'s largest truck manufacturers -- Navistar in the US and Traton, Volkswagen\\\'s trucking business, in Europe -- to design and build fully autonomous models, which it hopes to launch by 2024. Photos: The Yara Birkeland is what its builders call the world\\\'s first zero-emission, autonomous cargo ship. The ship is scheduled to make its first journey between two Norwegian to

In [11]:
def generate_headline_dynamic(topics, dictionary, lda_model, num_keywords=3):
    headline = ""
    for topic, _ in sorted(topics, key=lambda x: x[1], reverse=True)[:num_keywords]:
        topic_keywords = lda_model.show_topic(topic, topn=num_keywords)
        keywords = [word for word, _ in topic_keywords]
        headline += " ".join(keywords) + " "
    
    return headline.strip()

# Example usage:
# text = "China has announced plans to launch a new space mission to explore Mars. The mission, named Tianwen-1, will include an orbiter, a lander, and a rover. This marks China's latest step in its ambitious space exploration program. The mission aims to search for signs of past life and assess the planet's geology."
text = "What is your"
topics = infer_topics(lda_model, dictionary, text)
headline = generate_headline_dynamic(topics, dictionary, lda_model)
print(headline)


said year  said photo  caption photo


In [21]:
print(topics)

[(2, 0.75129306), (4, 0.23330176)]
