In [None]:
"""
【Introduction】
Topic model aims at discover the uncovering structure in a collection of texts. 
In project, we would experiment with documents released on using keyword research 
"véhicules électriques". The corpus was scrapped from the the French government 
website. 

This topic modeling will be conducted with Lantent Dirichlet Allocation (LDA)
method using the sklearn library in python Colab editor.



LDA Implementation：

1. Loading data
2. Data cleaning
3. Exploratory analysis
4. Preparing data for LDA analysis
5. Model Evaluation and parameter tuning

"""


In [25]:
"""
Importing data and preprocessing
"""

import pandas as pd
import numpy as np
import os

with open(r'C:\Users\adele\Documents\GitHub\Political-Motivation-behind-French-EV-Promotion\Dataset\test.csv') as evarticles:
    evarticles_list = list(evarticles)


In [27]:
"""
【2. Data cleaning】
"""


# Load the regular expression library
import re

#remove numbers, excessive white space, and /n 
def replacing(text):
  space_pattern = '\s+'
  giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
  mention_regex = '@[\w\-]+'
  replaced_text = re.sub(space_pattern, ' ', text)
  replaced_text = re.sub(giant_url_regex, '', replaced_text)
  replaced_text = re.sub('\n', '', replaced_text)
  return replaced_text

evarticles_replaced = evarticles_list.apply(replacing)

# Print out the first rows of papers
evarticles_replaced.head()

AttributeError: 'list' object has no attribute 'apply'

In [28]:
"""
【3. Exploratory Analysis: Word Cloud】
"""
# Import the wordcloud library
from wordcloud import WordCloud

# Join the different processed titles together.
long_string = ','.join(list(evarticles.values))

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(long_string)

# Visualize the word cloud
wordcloud.to_image()


AttributeError: '_io.TextIOWrapper' object has no attribute 'values'

In [None]:
"""
【4. Prepare text for LDA analysis】

"""

# Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# Helper function
def plot_10_most_common_words(count_data, count_vectorizer):
    import matplotlib.pyplot as plt
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts+=t.toarray()[0]
    
    count_dict = (zip(words, total_counts))
    count_dict = sorted(count_dict, key=lambda x:x[1], reverse=True)[0:10]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words)) 
    
    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    sns.barplot(x_pos, counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90) 
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()

# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(papers['paper_text_processed'])

# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)