# Transcript Analysis

This cell imports all the libraries needed for this project. **NumPy, nltk, and tika** may have to be installed prior to using this program, and the nltk `punkt` tokenizer and `stopwords` corpus must be downloaded.

In [69]:
import sys
import numpy as np
import nltk
from tika import parser
from nltk.corpus import stopwords

In [70]:
# Digits and symbols; the literal is split in two only for readability.
chars = set('0123456789' '$`!@#$%^&*(){}[]|\":;?><.,~-©')

def _readWordList(path):
    """Return the lines of the text file at *path*, closing it afterwards."""
    with open(path, 'r') as fin:
        return fin.read().splitlines()


def extractFiles(link):
    """Load the transcript path and the six word lists named in a master file.

    The master file at *link* is read line by line:
    line 1 is the transcript path, and lines 2-7 are the paths of the
    litigious, modal-strong, modal-weak, negative, positive and uncertainty
    word lists, in that order.

    Side effect: the six word lists are also stored in the module-level
    globals ``positive``, ``negative``, ``uncertainty``, ``litigious``,
    ``modal_strong`` and ``modal_weak``.

    Returns:
        A tuple ``(transcript_link, strengthDictionary)`` where
        ``strengthDictionary`` maps each category name to its word list.

    Raises:
        IndexError: if the master file has fewer than seven lines.
        OSError: if the master file or any listed file cannot be opened.
    """
    global positive, negative, uncertainty, litigious, modal_strong, modal_weak

    lines = _readWordList(link)
    transcript_link = lines[0]

    litigious = _readWordList(lines[1])
    modal_strong = _readWordList(lines[2])
    modal_weak = _readWordList(lines[3])
    negative = _readWordList(lines[4])
    positive = _readWordList(lines[5])
    uncertainty = _readWordList(lines[6])

    # The original returned this name without ever defining it (NameError).
    strengthDictionary = {
        'positive': positive,
        'modal_strong': modal_strong,
        'uncertainty': uncertainty,
        'modal_weak': modal_weak,
        'negative': negative,
        'litigious': litigious,
    }
    return transcript_link, strengthDictionary

In [71]:
def extractText(link):
    """Extract the text of the document at *link* and split it into tokens.

    The document is parsed with tika's ``parser.from_file`` and the resulting
    text is tokenized with ``nltk.word_tokenize``. Tokens keep their original
    casing.

    Returns:
        A list of token strings.

    Raises:
        ValueError: if the parser produced no text for the document.
    """
    raw = parser.from_file(link)
    text = raw['content']
    # NOTE(review): tika reports ``content`` as None when it cannot extract any
    # text (e.g. image-only PDFs); fail with a clear message instead of an
    # opaque TypeError from inside the tokenizer.
    if text is None:
        raise ValueError('No text could be extracted from: ' + str(link))
    return nltk.word_tokenize(text)

In [72]:
def removeSpecialCharacters(tokens):
    """Drop tokens that contain a character from ``chars`` or are stop words.

    A token is removed when any of its characters is in the module-level
    ``chars`` set, or when the token itself appears in nltk's English
    stop-word list. The stop-word comparison is exact, so tokens are matched
    in whatever casing they arrive in.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    # Build the stop-word set once; the original rebuilt it for every token.
    stopWords = set(stopwords.words('english'))
    return [
        token for token in tokens
        if not any(c in chars for c in token) and token not in stopWords
    ]

In [73]:
def scanStrengthList(tokens):
    """Count how many tokens fall into each sentiment word list.

    Each token is upper-cased and checked against the module-level word lists
    in this priority order; the first list containing it wins:
    positive, modal strong, uncertainty, modal weak, negative, litigious.

    Returns:
        A tuple ``(countArray, counted)`` where ``countArray`` holds the six
        per-category counts in the order above and ``counted`` is the number
        of tokens that matched any list.
    """
    # Sets give O(1) membership; the word lists themselves are plain lists.
    categories = [
        set(positive),
        set(modal_strong),
        set(uncertainty),
        set(modal_weak),
        set(negative),
        set(litigious),
    ]
    countArray = [0] * len(categories)
    counted = 0
    for token in tokens:
        word = token.upper()
        for index, words in enumerate(categories):
            if word in words:
                countArray[index] += 1
                counted += 1
                break  # a token counts only for its highest-priority list
    return countArray, counted

In [74]:
def calculatePoints(countArray, uncounted):
    """Turn the per-category counts into a signed sentiment score.

    Args:
        countArray: six counts in the order positive, modal strong,
            uncertainty, modal weak, negative, litigious.
        uncounted: despite its name, the number of tokens that matched a word
            list (the caller passes its ``counted`` value here). The name is
            kept for interface compatibility.

    Returns:
        ``log(|score / matched|) * 100``, negated when the weighted score is
        negative. A weighted score of exactly 0 is treated as 1. Returns 0.0
        when no tokens matched at all.
    """
    # The original read a global named ``counted`` instead of this parameter.
    counted = uncounted
    if counted == 0:
        # Nothing matched: avoid dividing by zero and report a neutral score.
        return 0.0

    weights = (11, 6, -1, -3, -6, -9)
    score = sum(weight * countArray[i] for i, weight in enumerate(weights))
    if score == 0:
        score = 1  # keeps the logarithm defined

    finalScore = np.log(np.absolute(score / counted)) * 100
    if score < 0:
        finalScore = finalScore * -1
    return finalScore

In [None]:
def createGraph():    
    """Plot per-vial series with a linear fit for each entry of ``tempLines``.

    NOTE(review): this cell is marked unexecuted (``In [None]``) and looks
    like a leftover from another project. It relies on names that are not
    defined or imported anywhere in the visible part of this file:
    ``tempLines``, ``plt``, ``hourPoints``, ``wildTypeZeros``, ``vialNumber``,
    ``mutationMatrix``, ``minValue`` and ``maxValue``.

    ``z`` is assigned inside this function (``z += 1``) but never initialised,
    so Python treats it as a local and any read of it raises
    UnboundLocalError.
    """
    for mutation in tempLines:
        # One new figure per entry.
        plt.figure()

        #mutationMatrix = np.ma.array(mutationMatrix, mask=np.isnan(mutationMatrix))

        plt.plot(hourPoints, wildTypeZeros, c='g', marker="^", label='wildType', markersize=50)

        # Only five colour/marker pairs exist, so vialNumber > 5 would raise IndexError.
        color = ['b', 'r', 'm', 'y', 'c']
        mark = ['s', 'o', 'h', 'D', '8']

        x = 0
        l=0
        while l < vialNumber:
            label = 'V'+str(l+1)
            # x marks the end of the current row slice; each slice is len(hourPoints) rows long.
            x += len(hourPoints)
            # Degree-1 least-squares fit of column z over the current slice.
            fit = np.polyfit(hourPoints, mutationMatrix[x-len(hourPoints):x, z], 1)
            fit_fn = np.poly1d(fit) 
            plt.plot(hourPoints, mutationMatrix[x-len(hourPoints):x, z], c=color[l], marker=mark[l], label=label, markersize=30)
            plt.plot(hourPoints, mutationMatrix[x-len(hourPoints):x, z], 'yo', hourPoints, fit_fn(hourPoints), c=color[l], linewidth=5)
            l += 1

    # Everything below is outside the for loop, so it runs once after the loop
    # and uses the last value of ``mutation``.
    plt.gca().set_ylim([minValue,maxValue])
    plt.legend(loc='upper left', prop={'size': 45})

    plt.title(mutation, fontsize=40)

    plt.xlabel('Hours', fontsize=40)
    plt.ylabel('log₂(Relative Frequency)', fontsize=40)

    plt.xticks(hourPoints, fontsize=30)
    plt.yticks(fontsize=30)

    plt.show()
    z += 1

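**Input**: Here, enter the path to the master text file. Its first line is the path to the transcript PDF, and the next six lines are the paths to the litigious, modal-strong, modal-weak, negative, positive, and uncertainty word lists, in that order. The entered path is stored in `pdfFile`.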

In [75]:
# Run the whole pipeline: load the word lists, tokenize the transcript,
# filter the tokens, count category hits and compute the score.
pdfFile = input('Enter PDF File directory: ')
transcript_link, strengthDictionary = extractFiles(pdfFile)
tokens = extractText(transcript_link)
filteredTokens = removeSpecialCharacters(tokens)
countArray, counted = scanStrengthList(filteredTokens)
finalScore = calculatePoints(countArray, counted)

print('\n')

# The rating is decided on the score rounded to a whole number, while the
# printed value keeps two decimals.
roundedScore = np.round(finalScore, 0)
if roundedScore > 0:
    rating = '/100 (RATED POSITIVE)'
elif roundedScore == 0:
    rating = '/100 (RATED NEUTRAL)'
elif roundedScore < 0:
    rating = '/100 (RATED NEGATIVE)'
else:
    # None of the comparisons held (e.g. NaN): print no score line.
    rating = None
if rating is not None:
    print('Final Score: ', np.round(finalScore, 2), rating)

statistics = [
    '\nStatistics:\n \t Positive Words:', countArray[0],
    '\n\t Modal Strong Words:', countArray[1],
    '\n \t Uncertain Words:', countArray[2],
    '\n \t Modal Weak Words:', countArray[3],
    '\n \t Negative Words:', countArray[4],
    '\n \t Litigious Words:', countArray[5],
    '\n \n Words Analyzed: ', counted,
    '\n Words Ignored: ', len(filteredTokens) - counted,
    '\n Words Filtered Out: ', len(tokens) - len(filteredTokens),
]
print(*statistics)

#/Users/rohitvemuri/Desktop/ProgrammingChallenges/Word_list/Masterfile.txt



Final Score:  -58.48 /100 (RATED NEGATIVE)

Statistics:
 	 Positive Words: 387 
	 Modal Strong Words: 13 
 	 Uncertain Words: 632 
 	 Modal Weak Words: 0 
 	 Negative Words: 675 
 	 Litigious Words: 377 
 
 Words Analyzed:  2084 
 Words Ignored:  26347 
 Words Filtered Out:  28054
