<a href="https://colab.research.google.com/github/Ramadhan1212/Text-Mining-Fast-Edu/blob/main/Text_Mining_Digital_Currency.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Text Mining**

### **Install and Load Packages**

In [1]:
# Install packages
! pip install tweet-preprocessor
! pip install pyLDAvis

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0
Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 20.9 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136898 sha256=1bdf7f1cc1360569ec8d97a180c283832de50d4ec479e6c12f00fe57b10ced2e
  Stored in directory: /root/.cache/pip/wheels/c9/21/f6/17bcf2667e8a68532ba2fbf6d5c72fdf4c7f7d9abfa4852d2f
Successfully

In [2]:
# Load packages
import pandas as pd
import matplotlib.pyplot as plt
import preprocessor as p
import numpy as np
import networkx as nx
import wordcloud
import nltk
import warnings
import itertools
import re
import os
import random
import pyLDAvis
import pyLDAvis.sklearn 

  from collections import Iterable
  from collections import Mapping


In [3]:
# Import module
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from wordcloud import STOPWORDS
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
from tqdm import tqdm
from nltk import bigrams
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation



In [4]:
# Set parameters
# Silence every Python warning for the rest of the session
warnings.filterwarnings('ignore')
# Switch pyLDAvis to notebook mode
pyLDAvis.enable_notebook()

### **Import Data**

In [5]:
# Import data: read the tweet CSV from GitHub into a DataFrame
df = pd.read_csv("https://raw.githubusercontent.com/andrybrew/bi-8-maret/main/data/tweet-full.csv")

In [6]:
# Show the first 5 rows of the data
df.head(5)

Unnamed: 0,created_at,screen_name,text
0,2020-12-30,PapaPapag1954,@LewisECFC @Kes1977 @herotroyippygod @GlobeSen...
1,2020-12-30,HsarafattoHogan,"@davidmcw Come on David, tell me billionaires ..."
2,2020-12-30,hunter_1st,@PeterSchiff Chinas digital currency could bri...
3,2020-03-26,hunter_1st,@Bipoker @woofBIGDAWG @PeterSchiff Bitcoin is ...
4,2020-12-30,Deanmufc77,@ecb @finanzfluss And people still call the NW...


In [7]:
# Summarise the columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26768 entries, 0 to 26767
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   created_at   26768 non-null  object
 1   screen_name  26768 non-null  object
 2   text         26768 non-null  object
dtypes: object(3)
memory usage: 627.5+ KB


### **Text Preprocessing**

In [None]:
# Keep only the tweet text column. Copy it so that the columns added in the
# following cells are written to an independent frame, not a slice of `df`.
tweet = df[['text']].copy()

# Show the first 5 rows of the data
tweet.head()

### **Transformation**

In [None]:
# Define the tweet transformation function
def transform_tweet(row):
  """Return the row's 'text' passed through `p.clean` and lower-cased."""
  cleaned = p.clean(row['text'])
  return cleaned.lower()

In [None]:
# Apply the transformation function to every row
tweet['transformed'] = tweet.apply(transform_tweet, axis=1)

# Show the first 5 rows of the data
tweet.head(5)

### **Tokenization**

In [None]:
# Download the Punkt tokenizer data needed by nltk.word_tokenize.
# Newer NLTK releases load the 'punkt_tab' resource instead of 'punkt',
# so fetch both to work with either.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Define the tokenization function
def tokenize_tweet(row):
    """Tokenize the row's 'transformed' text and keep only alphabetic tokens."""
    alphabetic_tokens = []
    for token in nltk.word_tokenize(row['transformed']):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens

In [None]:
# Apply the tokenization function to every row
tweet['tokenized'] = tweet.apply(tokenize_tweet, axis=1)

# Show the first 5 rows of the data
tweet.head(5)

### **Lemmatization**

In [None]:
# Download WordNet (needed for the lemmatization step)
nltk.download('wordnet')

In [None]:
# Define the lemmatization function
def lemmatize_tweet(row):
    """Lemmatize every token in the row's 'tokenized' list as a verb (pos='v').

    Returns a new list with one lemma per input token, in the same order.
    """
    tokens = row['tokenized']
    # Build the lemmatizer once per call instead of once per token
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in tokens]

In [None]:
# Apply the lemmatization function to every row
tweet['lemmatized'] = tweet.apply(lemmatize_tweet, axis=1)

# Show the first 5 rows of the data
tweet.head(5)

### **Stopword Removal**

In [None]:
# Download the English stopword list and keep it as a set
nltk.download('stopwords')
stops = set(stopwords.words("english"))     

In [None]:
# Define the stopword-removal function
def stopword_tweet(row):
    """Return the row's 'lemmatized' tokens that are not members of `stops`."""
    kept_tokens = []
    for token in row['lemmatized']:
        if token not in stops:
            kept_tokens.append(token)
    return kept_tokens

In [None]:
# Apply the stopword-removal function to every row
tweet['stopword'] = tweet.apply(stopword_tweet, axis=1)

# Show the first 5 rows of the data
tweet.head(5)

### **Rejoin**

In [None]:
# Define the rejoin function
def rejoin_tweet(row):
    """Concatenate the row's 'stopword' tokens into one space-separated string."""
    separator = " "
    return separator.join(row['stopword'])

In [None]:
# Apply the rejoin function to every row
tweet['final'] = tweet.apply(rejoin_tweet, axis=1)

# Show the first 5 rows of the data
tweet.head(5)

In [None]:
# Fully processed tweets: keep only the 'final' column and expose it as 'text'
tweet_clean = tweet[['final']].rename(columns={'final': 'text'})

# Show the first 5 rows of the data
tweet_clean.head(5)

### **Wordcloud**

In [None]:
# Word cloud visualisation: merge every cleaned tweet into one string
text_wordcloud = " ".join(tweet_clean['text'])
cloud = WordCloud(background_color='white').generate(text_wordcloud)

# Render the cloud as an image without axes
plt.figure(figsize=(10, 10), facecolor=None)
plt.imshow(cloud, interpolation="bilinear")
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

### **Sentiment Analysis**

In [None]:
# Download the lexicon used for sentiment analysis
nltk.download('vader_lexicon')

In [None]:
# Sentiment analysis: score the text of every cleaned tweet
sid = SentimentIntensityAnalyzer()
listy = []  # one polarity-score result per tweet, in row order
for index, row in tweet_clean.iterrows():
  ss = sid.polarity_scores(row['text'])
  listy.append(ss)
  
# NOTE: once the loop finishes, `ss` holds the scores of the LAST tweet only
se = pd.Series(listy)
# `.values` makes the assignment positional rather than index-aligned
tweet_clean['polarity'] = se.values
display(tweet_clean.head(5))

In [None]:
# Pie chart of the average negative / neutral / positive scores over ALL tweets.
# The scores are read from the 'polarity' column rather than from the loop
# variable `ss`, which only holds the scores of the last tweet that was scored.
labels = ['negative', 'neutral', 'positive']
polarity_scores = pd.DataFrame(tweet_clean['polarity'].tolist())
sizes  = polarity_scores[['neg', 'neu', 'pos']].mean().tolist()
plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.axis('equal')
plt.show()

### **Topic Modelling**

In [None]:
# clone tambahan library dari github
! git clone https://github.com/machine-learning-ss/tm

# Set Data Directory
os.chdir('tm')

In [None]:
# Helper module, presumably provided by the cloned 'tm' repository that the
# previous cell made the working directory — TODO confirm
import MyLib as TS

# Cleaned tweet texts used as the corpus for topic modelling
Tweets = tweet_clean['text']
print('Total loaded tweets = {0}'.format(len(Tweets)))

In [None]:
# Number of LDA components to fit
n_topics = 4
# Number of topics passed to TS.print_Topics
top_topics = 4
# Number of words per topic passed to TS.print_Topics
top_words = 10

In [None]:
# Feature extraction: term counts over alphabetic tokens of 3 or more letters
count_vector = CountVectorizer(token_pattern = r'\b[a-zA-Z]{3,}\b')
dtm_tf = count_vector.fit_transform(Tweets)
# `get_feature_names` was removed in scikit-learn 1.2. Use its replacement when
# available and fall back to the old name on older releases; either way
# `tf_terms` ends up as a plain list of strings.
if hasattr(count_vector, 'get_feature_names_out'):
    tf_terms = count_vector.get_feature_names_out().tolist()
else:
    tf_terms = count_vector.get_feature_names()

In [None]:
# Fit the LDA topic model on the term-count matrix
lda_tf = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    random_state=0,
).fit(dtm_tf)

# Show the topics
vsm_topics = lda_tf.transform(dtm_tf)
# Highest-weighted topic of each document, numbered from 1
doc_topic = [
    topic_weights.argmax() + 1
    for topic_weights in tqdm(vsm_topics)
]
print('In total there are {0} major topics, distributed as follows'.format(len(set(doc_topic))))
plt.hist(np.array(doc_topic), alpha=0.5)
plt.show()
print('Printing top {0} Topics, with top {1} Words:'.format(top_topics, top_words))
TS.print_Topics(lda_tf, tf_terms, top_topics, top_words)

In [None]:
# Interactive topic visualisation
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, count_vector) 

### **Text Network Analysis**

In [None]:
# Select the cleaned tweet text
text = tweet_clean['text']
text

In [None]:
# Tokenize every cleaned tweet
text_data = [word_tokenize(i) for i in text]
# Preview only the first few tokenized tweets; printing the whole corpus
# floods the notebook output.
print(text_data[:5])

In [None]:
# Define the co-occurrence function
def generate_co_occurrence_matrix(corpus):
    """Count adjacent-token co-occurrences in a flat token sequence.

    Parameters
    ----------
    corpus : iterable of hashable
        Tokens in reading order.

    Returns
    -------
    co_occurrence_matrix : numpy.matrix
        Square matrix with one row/column per distinct token. Entry
        ``[i, j]`` is the number of times token ``i`` appears immediately
        after token ``j`` (row = current token, column = previous token).
    vocab_index : dict
        Maps each distinct token to its row/column position. Positions follow
        first-occurrence order in ``corpus`` so the layout is the same on
        every run (iterating a ``set`` of strings is not).
    """
    # Local import keeps the function self-contained
    from collections import Counter

    tokens = list(corpus)

    # dict.fromkeys removes duplicates while preserving first-seen order
    vocab_index = {word: i for i, word in enumerate(dict.fromkeys(tokens))}

    # Frequency of each adjacent (previous, current) pair
    bigram_counts = Counter(zip(tokens, tokens[1:]))

    size = len(vocab_index)
    co_occurrence_matrix = np.zeros((size, size))
    for (previous, current), count in bigram_counts.items():
        co_occurrence_matrix[vocab_index[current], vocab_index[previous]] = count

    # Returned as np.matrix to keep the original return type
    co_occurrence_matrix = np.matrix(co_occurrence_matrix)

    return co_occurrence_matrix, vocab_index

In [None]:
# Build the adjacency matrix from all tokens flattened into a single list
data = [token for tokens in text_data for token in tokens]
matrix, vocab_index = generate_co_occurrence_matrix(data)

# Label rows and columns with the vocabulary words
data_matrix = pd.DataFrame(matrix, index=vocab_index, columns=vocab_index)

# Show the adjacency matrix
data_matrix.head()

In [None]:
# Build the network from the adjacency matrix
# NOTE(review): no `create_using` is passed, so this presumably builds an
# undirected graph although the co-occurrence matrix is directional — confirm intent
G = nx.from_pandas_adjacency(data_matrix)

In [None]:
# Degree centrality of the nodes in G
degree = nx.degree_centrality(G)

# NOTE(review): the centrality values are not used for the selection below.
# The slice keeps positions 1..599 of the key order and skips position 0 —
# confirm whether a top-N-by-centrality selection or a [:600] slice was intended.
list_node = list(degree) 
selected_node = list_node[1:600]

In [None]:
# Draw the sub-network induced by the selected nodes
sampled_graph = G.subgraph(selected_node)
draw_options = dict(
    with_labels=True,
    node_color='skyblue', node_size=600,
    arrowstyle='->', arrowsize=20, edge_color='r',
    font_size=7,
)

plt.figure(figsize=(10, 10), facecolor=None)
nx.draw(sampled_graph, pos=nx.kamada_kawai_layout(sampled_graph), **draw_options)