# Imports

In [1]:
#IMPORTS
########
import pandas as pd
import requests

import mysql.connector
import tpclean.tpclean as tp

from sqlalchemy import create_engine

#custom imports
from Scripts.config import role, bucket_name, prefix, bucket_path, sub_path

#establish connection
from Scripts.config import host, db
from Private.private import user , password

# Load Data

## establish connection

In [2]:
#load credentials
conn_kwargs = {"host": host,
               "user": user,
               "password": password}
# Raw DB-API connection and cursor, used for the hand-written UPDATE statements
conn = tp.sql_connect(db, db_type="mysql", **conn_kwargs)
#conn = mysql.connector.Connect(database = db, **conn_kwargs)
c = conn.cursor()

#connecting via sqlalchemy because pandas needs an engine to store data in a MySQL DB
# Percent-encode the credentials before embedding them in the URL: characters
# such as "@", ":" or "/" in a user name or password would otherwise be read
# as URL delimiters and the connection would target the wrong host/user.
from urllib.parse import quote_plus

engine = create_engine(
    f'mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}'
    f'@{conn_kwargs["host"]}:3306/{db}'
)

Connecting to mysql
successfully imported module
Connection to mysql successfull. with curser MySQLCursor: (Nothing executed yet)


## SQL Queries

In [3]:
# List the tables of the connected database
tp.sql("""show tables""")

Unnamed: 0,Tables_in_debater
0,content
1,conversations
2,test


In [4]:
# Origins in `content` for which no row has a `freq` value yet:
# COUNT(freq) skips NULLs, so a count of 0 means every freq of that origin is NULL.
# NOTE(review): presumably these are the files that still need processing - confirm.
words_df = tp.sql("""SELECT origin
FROM content
GROUP BY origin
HAVING COUNT(freq) = 0""")

In [6]:
# First (and only selected) column of the query result: the origin values
job_list = list(words_df.iloc[:,0])
# Work on the first entry; raises IndexError if the query returned no rows
file = job_list[0]

In [8]:
# Display the selected file name
file

'AM_101_affirmativeaction_pro.wav'

In [14]:
# Fetch the full text stored in `conversations` for the selected file.
# NOTE(review): the file name is interpolated straight into the SQL string;
# a name containing a double quote would break the query.
text_df = tp.sql(f"""SELECT full_text FROM conversations WHERE filename = "{file}" """)
# Entry labelled 0 (the first row, assuming the result has a default index)
text = text_df["full_text"][0]

# Feature Engineering

In [15]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk import FreqDist
from nltk.corpus import stopwords

In [None]:
# Lower-cased `content` entries of type "pronunciation" for the selected file,
# together with their position in the conversation.
file_df = tp.sql(f"""SELECT pos_in_conv,
                    LOWER(content) as content
                    FROM content
                    WHERE type = "pronunciation" and origin = "{file}" """)

In [None]:
# Second column of file_df (the lower-cased content) as a flat 1-D array
tokens = np.asarray(file_df.iloc[:, 1]).reshape(-1)

In [None]:
from nltk.tokenize import sent_tokenize
# Split the full text of the selected file into sentences
sentences = sent_tokenize(text)

## Remove Stopwords and Lemmatize

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer

# English stop words from the NLTK corpus
stopword_list = stopwords.words("english")
# WordNet-based lemmatizer from NLTK
lemmatizer = WordNetLemmatizer()

In [None]:
#define function for stopping and lemmatizing
def stoplem(x):
    """Return NaN for a stop word, otherwise the lemma of *x*."""
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
    # and raises AttributeError there as soon as a stop word is encountered.
    if x in stopword_list:
        return np.nan
    return lemmatizer.lemmatize(x)

#remove stopwords and lemmatize remaining words
file_df["lemmatized"] = file_df["content"].apply(stoplem)

In [None]:
#create frequency dict of the lemmas (rows without a lemma are skipped)
fDist_lemm = FreqDist(file_df["lemmatized"].dropna())

In [None]:
#append frequencies to lemmatized words
# one-column frame indexed by lemma, holding its count in "freq"
freqs = pd.DataFrame.from_dict(fDist_lemm, orient="index", columns=["freq"])
# left join keeps every row of file_df; rows without a lemma get NaN in "freq"
file_df = file_df.merge(freqs, how="left", left_on="lemmatized", right_index=True)

## Visualization

+ Upload Freqdists to new SQL Table
+ TODO : Make Wordclouds in Tableau

In [None]:
from wordcloud import WordCloud

In [None]:
def generate_wordcloud(text, **kwargs):
    """Render a word cloud from a word -> frequency mapping and show it.

    Parameters
    ----------
    text : mapping
        Maps each word (or phrase) to its frequency; it is handed to
        ``WordCloud.generate_from_frequencies``.
    **kwargs
        Additional ``WordCloud`` options. They take precedence over the
        defaults below, so e.g. ``max_words=50`` or a different ``font_path``
        can be passed without a duplicate-keyword ``TypeError``.
    """
    options = {
        # Location of Verdana on macOS; pass font_path=... on other systems
        "font_path": '/Library/Fonts/Verdana.ttf',
        "random_state": 42,
        "background_color": "white",
        "width": 800,
        "height": 400,
        "scale": 1,
        "max_words": 15,
        "relative_scaling": 1.0,
    }
    # Caller-supplied options override the defaults
    options.update(kwargs)
    wordcloud = WordCloud(**options).generate_from_frequencies(text)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

In [None]:
#lemmatized wordcloud
generate_wordcloud(fDist_lemm)

## Bigrams

In [None]:
# Preview the first rows of file_df
file_df.head()

In [None]:
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

In [None]:
# Build the bigram finder from the lemmas only. Stop words were set to NaN and
# are dropped here, so words separated only by stop words count as adjacent.
finder = BigramCollocationFinder.from_words(file_df["lemmatized"].dropna())

In [None]:
# Ten highest-ranked bigrams according to the likelihood-ratio measure
finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)

In [None]:
# Bigram counts collected by the finder, keyed by (word, word) tuples
bigrams_fd = finder.ngram_fd

In [None]:
def bigram_to_single_word(bigrams_fd):
    """Flatten bigram keys into space-separated strings.

    Takes a mapping keyed by (first, second) word pairs and returns a plain
    dict keyed by ``"first second"`` that carries the same values.
    """
    return {
        pair[0] + " " + pair[1]: count
        for pair, count in bigrams_fd.items()
    }

In [None]:
# Turn the bigram tuples into "word word" strings so they can be drawn as a word cloud
testfd = bigram_to_single_word(bigrams_fd)
generate_wordcloud(testfd)

# Sentiment analysis

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# VADER sentiment analyzer from NLTK, used to score the lemmas
sia = SentimentIntensityAnalyzer()

In [None]:
def sc(x):
    """Return the compound polarity score that `sia` assigns to *x*."""
    return sia.polarity_scores(x)["compound"]


# Only rows that kept a lemma are scored; the assignment aligns on the index,
# so the remaining rows end up with NaN.
file_df["sentiment_score"] = file_df["lemmatized"].dropna().apply(sc)
# Weight of a word: its frequency multiplied by its sentiment score
file_df["weight"] = file_df["freq"].mul(file_df["sentiment_score"])

In [None]:
# Inspect the first 25 rows, including the new sentiment columns
file_df.head(25)

## Load Data to DB

In [None]:
# Preview the rows without missing values, i.e. the rows written back below
file_df.dropna().head(10)

In [None]:
#update DB
# Only complete rows are written back (rows for stop words contain NaN)
updates = file_df.dropna()
# Every column except the first (pos_in_conv, used only as the row key) is updated
columns = list(updates.columns[1:])
# Column names cannot be bound as parameters, so they are formatted into the
# statement; all values are bound through %s placeholders so that quotes in
# the text can neither break the statement nor inject SQL.
set_ = ", ".join(f"{key} = %s" for key in columns)
querry = f"""UPDATE content
             SET {set_}
             WHERE pos_in_conv = %s
             AND origin = %s
         """
for _, row in updates.iterrows():
    # Values are sent as text (MySQL casts them to the column type), which
    # also avoids handing NumPy scalar types to the driver.
    params = [str(row[key]) for key in columns]
    params.extend([str(row["pos_in_conv"]), file])
    c.execute(querry, tuple(params))
conn.commit()
print("Inserted frequencydict in content")

In [None]:
# Display the name of the file that was just processed
file

# Summarizing the Text

To summarize the text, an external API called Aylien is used.

In [None]:
from aylienapiclient import textapi
from Private.private import aylien_app_id,aylien_API_KEY
from Scripts.config import cfg_summary_len ,cfg_summary_lang

In [None]:
# Aylien Text API client, created with the app id and API key from the private config
aylien = textapi.Client(aylien_app_id,aylien_API_KEY)

In [None]:
# Request a summary of the full text; the number of sentences and the
# language come from the project config.
summary = aylien.Summarize({"title": file, 
                  "text": text,
                 "sentences_number": cfg_summary_len, 
                 "language": cfg_summary_lang })

In [None]:
# Join the returned summary sentences into a single string
summary_text = " ".join(summary["sentences"])

In [None]:
# Store the summary for the processed file. The values are bound as
# parameters: summaries are free text and may contain apostrophes, which
# would terminate a hand-quoted SQL string literal.
c.execute(
    """UPDATE conversations
            SET summary = %s
            WHERE filename = %s""",
    (summary_text, file),
)
conn.commit()

# Make ready for export

In [None]:
# Close the database connection opened at the top of the notebook
conn.close()