In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
%matplotlib inline

### Text Analytics

In [95]:
# Raw sample paragraph used by every cell below. Despite the name, `df` is a
# plain Python string here, not a pandas DataFrame. The text contains stray
# symbols (e.g. '#', '**', '&', backslashes, '[[[['), some of which the later
# re.sub cells strip out.
df = "The University of North Carolina at Charlotte (UNC Charlotte, UNCC, or simply Charlotte[6]) is a public research university in Charlotte, North Carolina. UNC $Charlotte offers 23 doctoral#, 64 master's, and 140 bachelor's degree programs through nine colleges: the College of Arts + Architecture, the College of Liberal Arts & Sciences, the Belk College of Business, the College of Computing and Informatics, the Cato College of Education, the** William States Lee College of Engineering, the College of Health and Human Services, the Honors College, and the Unive&rsity College.[7] It has three\\\\ campuses: Charlotte Research Institute Campus, Center City Campus, and the main campus, located in University City. The main campus sits on 1,000 wooded acres with approximately 85 buildings about 8 miles (13 km) from Uptown Charlotte.[8] UNC Charlotte is the largest institution of higher education in the Charlotte region. The university has experienced rapid enrollment growth of 33% over the past 10 years, [[[[making it the fastest-growing institution in the UNC System and contributing to more than 50% of the…. system's growth since 2009.[9]"

In [96]:
# Display the raw text before any cleaning.
df

"The University of North Carolina at Charlotte (UNC Charlotte, UNCC, or simply Charlotte[6]) is a public research university in Charlotte, North Carolina. UNC $Charlotte offers 23 doctoral#, 64 master's, and 140 bachelor's degree programs through nine colleges: the College of Arts + Architecture, the College of Liberal Arts & Sciences, the Belk College of Business, the College of Computing and Informatics, the Cato College of Education, the** William States Lee College of Engineering, the College of Health and Human Services, the Honors College, and the Unive&rsity College.[7] It has three\\\\ campuses: Charlotte Research Institute Campus, Center City Campus, and the main campus, located in University City. The main campus sits on 1,000 wooded acres with approximately 85 buildings about 8 miles (13 km) from Uptown Charlotte.[8] UNC Charlotte is the largest institution of higher education in the Charlotte region. The university has experienced rapid enrollment growth of 33% over the pas

In [97]:
# Remove noise characters: any run of ( ) ? # & * ! as well as backslashes and
# forward slashes. Other symbols (e.g. '$', '[', ']') are left untouched.
df = re.compile(r'[()?#&*!]+|\\|\/').sub('', df)

In [98]:
# Display the text after the symbol-stripping substitution.
df

"The University of North Carolina at Charlotte UNC Charlotte, UNCC, or simply Charlotte[6] is a public research university in Charlotte, North Carolina. UNC $Charlotte offers 23 doctoral, 64 master's, and 140 bachelor's degree programs through nine colleges: the College of Arts + Architecture, the College of Liberal Arts  Sciences, the Belk College of Business, the College of Computing and Informatics, the Cato College of Education, the William States Lee College of Engineering, the College of Health and Human Services, the Honors College, and the University College.[7] It has three campuses: Charlotte Research Institute Campus, Center City Campus, and the main campus, located in University City. The main campus sits on 1,000 wooded acres with approximately 85 buildings about 8 miles 13 km from Uptown Charlotte.[8] UNC Charlotte is the largest institution of higher education in the Charlotte region. The university has experienced rapid enrollment growth of 33% over the past 10 years, [

In [99]:
# Collapse every run of whitespace into a single space.
df = re.compile(r'[\s]+').sub(' ', df)

In [100]:
# Display the text after whitespace runs have been collapsed.
df

"The University of North Carolina at Charlotte UNC Charlotte, UNCC, or simply Charlotte[6] is a public research university in Charlotte, North Carolina. UNC $Charlotte offers 23 doctoral, 64 master's, and 140 bachelor's degree programs through nine colleges: the College of Arts + Architecture, the College of Liberal Arts Sciences, the Belk College of Business, the College of Computing and Informatics, the Cato College of Education, the William States Lee College of Engineering, the College of Health and Human Services, the Honors College, and the University College.[7] It has three campuses: Charlotte Research Institute Campus, Center City Campus, and the main campus, located in University City. The main campus sits on 1,000 wooded acres with approximately 85 buildings about 8 miles 13 km from Uptown Charlotte.[8] UNC Charlotte is the largest institution of higher education in the Charlotte region. The university has experienced rapid enrollment growth of 33% over the past 10 years, [[

#### Sentence Segmentation

In [101]:
# Naive segmentation: split on every '.' character. As the output shows, the
# periods are dropped and citation markers such as '[7]' end up at the start
# of the following piece.
df.split('.')

['The University of North Carolina at Charlotte UNC Charlotte, UNCC, or simply Charlotte[6] is a public research university in Charlotte, North Carolina',
 " UNC $Charlotte offers 23 doctoral, 64 master's, and 140 bachelor's degree programs through nine colleges: the College of Arts + Architecture, the College of Liberal Arts Sciences, the Belk College of Business, the College of Computing and Informatics, the Cato College of Education, the William States Lee College of Engineering, the College of Health and Human Services, the Honors College, and the University College",
 '[7] It has three campuses: Charlotte Research Institute Campus, Center City Campus, and the main campus, located in University City',
 ' The main campus sits on 1,000 wooded acres with approximately 85 buildings about 8 miles 13 km from Uptown Charlotte',
 '[8] UNC Charlotte is the largest institution of higher education in the Charlotte region',
 ' The university has experienced rapid enrollment growth of 33% over 

In [102]:
# Segment the cleaned text into sentences with NLTK's sentence tokenizer.
# `df` is rebound from a single string to a list of sentence strings here, so
# re-running this cell without re-running the cells above would hand the
# tokenizer a list instead of text.
df = sent_tokenize(df)

In [103]:
# Preview the first five sentences.
df[:5]

['The University of North Carolina at Charlotte UNC Charlotte, UNCC, or simply Charlotte[6] is a public research university in Charlotte, North Carolina.',
 "UNC $Charlotte offers 23 doctoral, 64 master's, and 140 bachelor's degree programs through nine colleges: the College of Arts + Architecture, the College of Liberal Arts Sciences, the Belk College of Business, the College of Computing and Informatics, the Cato College of Education, the William States Lee College of Engineering, the College of Health and Human Services, the Honors College, and the University College.",
 '[7] It has three campuses: Charlotte Research Institute Campus, Center City Campus, and the main campus, located in University City.',
 'The main campus sits on 1,000 wooded acres with approximately 85 buildings about 8 miles 13 km from Uptown Charlotte.',
 '[8] UNC Charlotte is the largest institution of higher education in the Charlotte region.']

In [104]:
# Number of sentences produced by the sentence tokenizer.
len(df)

8

#### Word Tokenization

In [105]:
# Word-tokenize each sentence; the result holds one token list per sentence.
tokenized_sentences = [nltk.word_tokenize(sent) for sent in df]

In [106]:
# Tokens of the first sentence; punctuation and brackets are separate tokens.
tokenized_sentences[0]

['The',
 'University',
 'of',
 'North',
 'Carolina',
 'at',
 'Charlotte',
 'UNC',
 'Charlotte',
 ',',
 'UNCC',
 ',',
 'or',
 'simply',
 'Charlotte',
 '[',
 '6',
 ']',
 'is',
 'a',
 'public',
 'research',
 'university',
 'in',
 'Charlotte',
 ',',
 'North',
 'Carolina',
 '.']

#### Part-of-Speech Tagging

In [107]:
# Quick check of the tagger on a two-word phrase.
nltk.pos_tag(nltk.word_tokenize('UNCC Campus'))

[('UNCC', 'NNP'), ('Campus', 'NNP')]

In [108]:
# POS-tag the fourth sentence (index 3).
# NOTE(review): in the output below 'sits' is tagged NNS (plural noun) although
# it is the verb of this sentence, so the tags should not be trusted blindly.
nltk.pos_tag(tokenized_sentences[3])

[('The', 'DT'),
 ('main', 'JJ'),
 ('campus', 'NN'),
 ('sits', 'NNS'),
 ('on', 'IN'),
 ('1,000', 'CD'),
 ('wooded', 'VBD'),
 ('acres', 'NNS'),
 ('with', 'IN'),
 ('approximately', 'RB'),
 ('85', 'CD'),
 ('buildings', 'NNS'),
 ('about', 'IN'),
 ('8', 'CD'),
 ('miles', 'NNS'),
 ('13', 'CD'),
 ('km', 'NN'),
 ('from', 'IN'),
 ('Uptown', 'NNP'),
 ('Charlotte', 'NNP'),
 ('.', '.')]

#### Lemmatization

In [117]:
# Lemmatize every token and rebuild each sentence as a space-separated string.
# The lemmatizer is instantiated in this cell so that it runs on a fresh kernel
# (Restart & Run All); no visible cell defines a lemmatizer object.
# lemmatize() is called without a POS argument.
lemmatizer = WordNetLemmatizer()

# Sentences are joined without a trailing space; the stop-word cell splits on
# whitespace, so it is unaffected.
lemmatized_sentence = [
    ' '.join(lemmatizer.lemmatize(token) for token in sentence)
    for sentence in tokenized_sentences
]

#### Stop Words Removal 

In [118]:
# Drop stop words from every lemmatized sentence.
# The lookup set is built once, outside the loops: calling stopwords.words()
# per token rebuilds the whole list and scans it linearly for every word.
# NOTE(review): words() is called without a language argument — presumably this
# pulls the lists for every language NLTK ships; confirm whether
# stopwords.words('english') was intended.
# Matching is case-sensitive, so capitalised tokens such as 'The' and 'It'
# are kept.
stop_words = set(stopwords.words())

stopwords_removed = [
    ' '.join(token for token in sentence.split() if token not in stop_words)
    for sentence in lemmatized_sentence
]

In [119]:
# Show the sentences after stop-word removal (one string per sentence).
stopwords_removed

['The University North Carolina Charlotte UNC Charlotte , UNCC , simply Charlotte [ 6 ] public research university Charlotte , North Carolina .',
 "UNC $ Charlotte offer 23 doctoral , 64 master 's , 140 bachelor 's degree program nine college : College Arts + Architecture , College Liberal Arts Sciences , Belk College Business , College Computing Informatics , Cato College Education , William States Lee College Engineering , College Health Human Services , Honors College , University College .",
 '[ 7 ] It three campus : Charlotte Research Institute Campus , Center City Campus , main campus , located University City .',
 'The main campus sits 1,000 wooded acre approximately 85 building 8 mile 13 km Uptown Charlotte .',
 '[ 8 ] UNC Charlotte largest institution higher education Charlotte region .',
 'The university experienced rapid enrollment growth 33 % past 10 year , [ [ [ [ making fastest-growing institution UNC System contributing 50 % the… .',
 "system 's growth since 2009 .",
 '[