In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF

In [3]:
# Column labels for the CSV. Because `names=` is passed without `header=`,
# pandas treats the first line of the file as data rather than as a header.
column_names = ["target", "ids", "date", "flag", "user", "text"]

# NOTE(review): absolute local path; adjust to wherever the CSV lives on your machine.
df = pd.read_csv(
    "C:/Kaggle/training.1600000.processed.noemoticon.csv",
    names=column_names,
    encoding="latin-1",
)

# Quick sanity check: first rows, then the distribution of the `target` column.
print(df.head())
print(df["target"].value_counts())

   target         ids                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  
target
0    800000
4    800000
Name: count, dtype: int64


In [5]:
import re
def clean_text(text):
    """Normalise a raw tweet string for bag-of-words vectorisation.

    The following are deleted (replaced by the empty string), in order:
      1. URLs: anything starting with ``http`` up to the next whitespace.
      2. @-mentions: ``@`` followed by word characters.
      3. Every character that is not an ASCII letter or whitespace.

    The result is then lower-cased. Whitespace is left untouched, so the
    output may contain leading, trailing, or repeated spaces.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    # Order matters: the symbol-stripping pattern must run last, otherwise it
    # would delete the "@" and "://" that the first two patterns key on.
    removal_patterns = (
        r"http\S+",       # URLs
        r"@\w+",          # mentions
        r"[^a-zA-Z\s]",   # digits, punctuation, and other symbols
    )
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.lower()

In [7]:
# Run the cleaner over every tweet and keep the result in a new column,
# leaving the raw "text" column untouched.
df["clean_text"] = df["text"].map(clean_text)

# Preview the first few cleaned tweets.
print(df["clean_text"].head())

0       awww thats a bummer  you shoulda got david ...
1    is upset that he cant update his facebook by t...
2     i dived many times for the ball managed to sa...
3      my whole body feels itchy and like its on fire 
4     no its not behaving at all im mad why am i he...
Name: clean_text, dtype: object


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorise the cleaned tweets with TF-IDF: the vocabulary is capped at
# 5000 terms and scikit-learn's built-in English stop words are excluded.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# Learn the vocabulary/IDF weights and build the document-term matrix in one pass.
X = vectorizer.fit_transform(df["clean_text"])

# (number of documents, number of vocabulary terms)
print(X.shape)

(1600000, 5000)


In [10]:
from sklearn.decomposition import NMF
# Number of topics to extract; passed to NMF as `n_components`.
n_topics = 10

In [11]:
# Define and fit the NMF model. `random_state` is fixed so the
# factorisation is reproducible between runs.
nmf = NMF(n_components=n_topics, random_state=0)

# W: one row per document, one column per topic (returned by fit_transform).
# H: one row per topic, one column per vocabulary term (nmf.components_).
W = nmf.fit_transform(X)
H = nmf.components_

# Vocabulary terms, indexed the same way as the columns of H.
feature_names = vectorizer.get_feature_names_out()

# For each topic, list its 10 highest-weighted terms, strongest first.
for topic_no, term_weights in enumerate(H, start=1):
    strongest = term_weights.argsort()[::-1][:10]
    top_words = [feature_names[term_idx] for term_idx in strongest]
    joined = ' | '.join(top_words)
    print(f"Topic {topic_no}: {joined}")

Topic 1: just | got | home | lol | woke | new | twitter | time | ive | finished
Topic 2: good | morning | night | luck | thats | hope | time | sounds | feeling | feel
Topic 3: im | sorry | gonna | tired | sad | sure | lol | bored | sick | right
Topic 4: thanks | follow | following | lol | great | followfriday | hey | haha | ill | ff
Topic 5: work | tomorrow | today | home | ready | time | getting | doesnt | hours | didnt
Topic 6: love | song | new | thank | haha | lt | lol | oh | watching | guys
Topic 7: dont | know | want | really | think | lol | feel | wanna | let | need
Topic 8: day | today | happy | great | mothers | hope | nice | tomorrow | school | long
Topic 9: miss | really | gonna | home | ill | come | friends | guys | baby | days
Topic 10: going | like | today | time | feel | bed | night | sleep | tomorrow | wish
