# Match TOPICS

- using LDA to predict the topic of a text

# Process the topic data

### Preprocessing the data

- *get the topic data*

In [1]:
import pandas as pd

In [2]:
with open("list_of_edits.txt",encoding="utf-8") as topic_file:
    topics = [each for each in topic_file.readlines()]
    print(topics)

['"Polish death camp" controversy\n', "'Ubadah ibn al-Samit\n", '116th United States Congress\n', '13 Reasons Why\n', '1721 Boston smallpox outbreak\n', '1900\n', '1917 (2019 film)\n', '1993 NCAA Division I Outdoor Track and Field Championships\n', '1999 Matamoros standoff\n', '1999 NCAA Division I Outdoor Track and Field Championships\n', '19 Kids and Counting\n', '2.0 (film)\n', '2000 NCAA Division I Outdoor Track and Field Championships\n', '2002 Stromboli tsunami\n', '2007 NCAA Division I Indoor Track and Field Championships\n', '2008 NCAA Division I Indoor Track and Field Championships\n', '2010 in country music\n', '2011 World Masters Athletics Championships\n', '2013 local electoral calendar\n', '2016 AFF Championship\n', '2016 AFL season\n', '2016 ATP Challenger Tour\n', '2016 American League Championship Series\n', '2016 American League Division Series\n', '2016 Atlantic hurricane season\n', '2016 Berlin truck attack\n', '2016 Chicago Cubs season\n', '2016 China Open Super Ser

In [3]:
topics_df = pd.DataFrame(topics)

In [4]:
topics_df = topics_df.rename(columns={0:"topics"})

In [5]:
topics_df = topics_df["topics"].apply(lambda x: x[:-1])

In [6]:
topics_df = pd.DataFrame(topics_df)

In [7]:
topics_df

Unnamed: 0,topics
0,"""Polish death camp"" controversy"
1,'Ubadah ibn al-Samit
2,116th United States Congress
3,13 Reasons Why
4,1721 Boston smallpox outbreak
...,...
3478,Zoom Video Communications
3479,Ædes Danielis
3480,Église Saint-Patern de Vannes
3481,÷ (album)


- *remove punctuation*

In [8]:
import string

In [9]:
def remov_punc(data):
    for each in string.punctuation:
        data["topics"] = data["topics"].apply(lambda x: x.replace(each,' '))
    return data

In [10]:
cleaning_data = remov_punc(topics_df)
cleaning_data

Unnamed: 0,topics
0,Polish death camp controversy
1,Ubadah ibn al Samit
2,116th United States Congress
3,13 Reasons Why
4,1721 Boston smallpox outbreak
...,...
3478,Zoom Video Communications
3479,Ædes Danielis
3480,Église Saint Patern de Vannes
3481,÷ album


- *remove special punctuation*

In [11]:
from nltk.tokenize import word_tokenize

In [12]:
cleaning_data.loc[3481,"topics"][0].isalnum()

False

In [13]:
cleaning_data.loc[3481,"topics"][3].isalnum()

True

In [14]:
cleaning_data.loc[3481,"topics"][1].isalnum()

False

In [15]:
def remove_special_punc(data):
    index = 0
    for each_topic in data["content"].apply(lambda x: word_tokenize(x)):
        topic = []
        for each_word in each_topic:
            word = ""
            for each_letter in each_word:
                if each_letter.isalnum():
                    word += each_letter
            topic.append(word)
        topic_sentence = ' '.join(topic)
        data.loc[index:"content"] = topic_sentence
        index += 1
    return data

In [16]:
remove_special_punc(cleaning_data)

Unnamed: 0,topics
0,Polish death camp controversy
1,Ubadah ibn al Samit
2,116th United States Congress
3,13 Reasons Why
4,1721 Boston smallpox outbreak
...,...
3478,Zoom Video Communications
3479,Ædes Danielis
3480,Église Saint Patern de Vannes
3481,album


- *Stop word*

In [17]:
from nltk.corpus import stopwords 

In [18]:
def remove_stopword(data):
    stop_words = set(stopwords.words('english'))
    index = 0
    for each_words in data["topics"].apply(lambda x: word_tokenize(x)):
        words = [w for w in each_words if not w in stop_words]
        data.loc[index,"topics"] = " ".join(words)
        index += 1
    return data

In [19]:
cleaning_data = remove_stopword(cleaning_data)

In [20]:
cleaning_data

Unnamed: 0,topics
0,Polish death camp controversy
1,Ubadah ibn al Samit
2,116th United States Congress
3,13 Reasons Why
4,1721 Boston smallpox outbreak
...,...
3478,Zoom Video Communications
3479,Ædes Danielis
3480,Église Saint Patern de Vannes
3481,album


- *Lower case*

In [21]:
def lower_case(data):
    data["topics"] = data["topics"].apply(lambda x: x.lower())
    return data

In [22]:
cleaning_data = lower_case(cleaning_data)

In [23]:
cleaning_data

Unnamed: 0,topics
0,polish death camp controversy
1,ubadah ibn al samit
2,116th united states congress
3,13 reasons why
4,1721 boston smallpox outbreak
...,...
3478,zoom video communications
3479,ædes danielis
3480,église saint patern de vannes
3481,album


- *Lemmatize*

In [24]:
from nltk.stem import WordNetLemmatizer

In [25]:
def lemmatize_word(data):
    index = 0
    lemmatizer = WordNetLemmatizer()
    for each_text in data["topics"].apply(lambda x: word_tokenize(x)):
        words = [w for w in each_text]
        lemmatized = [lemmatizer.lemmatize(word) for word in words]
        data.loc[index,"topics"] = " ".join(lemmatized)
        index += 1
    return data

In [26]:
cleaning_data = lemmatize_word(cleaning_data)

In [27]:
cleaning_data

Unnamed: 0,topics
0,polish death camp controversy
1,ubadah ibn al samit
2,116th united state congress
3,13 reason why
4,1721 boston smallpox outbreak
...,...
3478,zoom video communication
3479,ædes danielis
3480,église saint patern de vannes
3481,album


# Summarization modeling

- We use a already done model to summarize the input of a user

MODEL ARTICLE

**[article of a model](https://arxiv.org/pdf/1704.04368.pdf)**

**[model file in github](https://github.com/abisee/pointer-generator)**

#### hard to use ---- so we decide to use the API & library

- USE [API](https://rapidapi.com/MeaningCloud/api/summarization/endpoints)

In [43]:
import requests

url = "https://meaningcloud-summarization-v1.p.rapidapi.com/summarization-1.0"

text = """Great WIN Anna! Watched your race from beginning, very impressed. Your next opponent, Charlie Crist, is a Pelosi puppet who is bad on Crime, our Military, Vets, & 2nd Amendment. You have my Complete & Total Endorsement!"""

querystring = {"txt":text,"sentences":"1"}

headers = {
    'x-rapidapi-host': "meaningcloud-summarization-v1.p.rapidapi.com",
    'x-rapidapi-key': "d3d60bd862msh7cc072c3677bc5dp1c6d28jsn537366b4af8b",
    'accept': "application/json"
    }

response = requests.get(url,headers=headers,params=querystring).json()

In [44]:
response

{'status': {'code': '0',
  'msg': 'OK',
  'credits': '1',
  'remaining_credits': '678262'},
 'summary': 'Your next opponent, Charlie Crist, is a Pelosi puppet who is bad on Crime, our Military, Vets, & 2nd Amendment.'}

- USE [bert-extractive-summarizer](https://pypi.org/project/bert-extractive-summarizer/)

In [28]:
#!pip install bert-extractive-summarizer

In [45]:
from summarizer import Summarizer

NameError: name 'BertModel' is not defined

In [None]:
model = Summarizer()


# Match the reusult of the summarization with topic list