In [28]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import  MultinomialNB
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn import svm
import string
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [29]:
news = fetch_20newsgroups(subset='all')

In [30]:
print(len(news.data))

18846


In [31]:
print(len(news.target_names))

20


In [32]:
print(news.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [33]:
def train_test(classifier, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 48)
    classifier.fit(X_train, y_train)
    print(f"Classifier testing accuracy: {classifier.score(X_test, y_test)}")
    
    print(f"Classifier training accuracy: {classifier.score(X_train, y_train)}")
    return classifier

In [34]:
trial1 = Pipeline([("vectorizer", TfidfVectorizer()),
                    ("classifier", MultinomialNB())])


In [35]:
train_test(trial1, news.data, news.target)

Classifier testing accuracy: 0.8535653650254669
Classifier training accuracy: 0.9254987972265459


Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier', MultinomialNB())])

In [None]:
trial2 = Pipeline([("vectorizer", TfidfVectorizer(stop_words=stopwords.words('english'))),
                    ("classifier", MultinomialNB())])

In [None]:
train_test(trial2, news.data, news.target)

Classifier testing accuracy: 0.8828522920203735
Classifier training accuracy: 0.9472194707796802


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('classifier', MultinomialNB())])

In [None]:
trial3 = Pipeline([("vectorizer", TfidfVectorizer(stop_words=stopwords.words('english'))), 
                   ("classifier", MultinomialNB(alpha=0.05))])

In [None]:
train_test(trial3, news.data, news.target)

Classifier testing accuracy: 0.91553480475382
Classifier training accuracy: 0.9898118013301259


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('classifier', MultinomialNB(alpha=0.05))])

In [None]:
trail4 = Pipeline([("vectorizer", TfidfVectorizer(stop_words=stopwords.words("english") + list(string.punctuation), min_df = 5)),
                    ("classifier", svm.LinearSVC())])

In [None]:
train_test(trail4, news.data, news.target)

Classifier testing accuracy: 0.9276315789473685
Classifier training accuracy: 0.998584972406962


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(min_df=5,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('classifier', LinearSVC())])

In [None]:
train5 = Pipeline([("vectorizer", TfidfVectorizer(stop_words=stopwords.words("english") + list(string.punctuation), min_df = 5)),
                    ("classifier", RandomForestClassifier(random_state = 42))])

In [None]:
train_test(train5, news.data, news.target)

Classifier testing accuracy: 0.8535653650254669
Classifier training accuracy: 0.999929248620348


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(min_df=5,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('classifier', RandomForestClassifier(random_state=42))])

In [None]:
train6 = Pipeline([("vectorizer", TfidfVectorizer(stop_words=stopwords.words("english") + list(string.punctuation), min_df = 5)),
                    ("classifier", xgb.XGBClassifier(objective='multi:softmax', num_class=3, seed=42))])

In [None]:
train_test(train6, news.data, news.target)

Classifier testing accuracy: 0.8486842105263158
Classifier training accuracy: 0.999929248620348


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(min_df=5,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('classifier',
                 XGBClassifier(base_score=None, booster=...
                               grow_policy=None, importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_threshol

In [None]:
%pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
     -------------------------------------- 232.6/232.6 kB 2.4 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
%pip install textract

Collecting textract
  Downloading textract-1.6.5-py3-none-any.whl (23 kB)
Collecting pdfminer.six==20191110
  Downloading pdfminer.six-20191110-py2.py3-none-any.whl (5.6 MB)
     ---------------------------------------- 5.6/5.6 MB 6.5 MB/s eta 0:00:00
Collecting python-pptx~=0.6.18
  Downloading python-pptx-0.6.21.tar.gz (10.1 MB)
     ---------------------------------------- 10.1/10.1 MB 7.6 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting xlrd~=1.2.0
  Downloading xlrd-1.2.0-py2.py3-none-any.whl (103 kB)
     -------------------------------------- 103.3/103.3 kB 2.9 MB/s eta 0:00:00
Collecting beautifulsoup4~=4.8.0
  Downloading beautifulsoup4-4.8.2-py3-none-any.whl (106 kB)
     -------------------------------------- 106.9/106.9 kB 6.0 MB/s eta 0:00:00
Collecting docx2txt~=0.8
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished wi

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.2.2 requires pyqt5<5.13, which is not installed.
spyder 5.2.2 requires pyqtwebengine<5.13, which is not installed.
conda-repo-cli 1.0.20 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.20 requires nbformat==5.4.0, but you have nbformat 5.5.0 which is incompatible.
anaconda-client 1.11.0 requires six>=1.15.0, but you have six 1.12.0 which is incompatible.


In [79]:
import PyPDF2

In [80]:
pdfFileObj = open('In-class_Assignment-3 (1).pdf', 'rb')

In [81]:
pdfReader = PyPDF2.PdfReader(pdfFileObj)

In [82]:
print(len(pdfReader.pages))

2


In [83]:
pageObj = pdfReader.pages[0]

In [84]:
print(pageObj.extract_text())

In-Class -Assigment 3:   
Statistic Analysis  
Assignment Type:  Individual lab work  
Weight : 5% 
Time : In Class/ …../  
Tasks :  
In this assignment, you will continue working on your selected data source from the previous 
assignment and present data set statistics  and practice data visualization.  
For each selected column compute : 
1. Range  
2. Interquartile  range  (IQR)  
3. Median  
4. Mean  
5. Mode  
6. 1st and 3rd  quartile s 
7. Sample variance and sample standard deviation  
8. Find Outliers   
Draw the following  diagrams:  
1. Box Plot  
2. Sea-born Simplot  
3. Linear regression  
4. HeatMap  
Perform statistical analysis and present  results  in graphical format . 
Deliverables:  
a) Submit a professional r eport  including at least:   
a. Explain  the steps/formulas that have been used for calculation process  


In [85]:
#pdfFileObj.close()

In [86]:
c = 0
text = ""

while c< len(pdfReader.pages):
    pageObj = pdfReader.pages[c]
    c+=1
    text += pageObj.extract_text()

print(text)

In-Class -Assigment 3:   
Statistic Analysis  
Assignment Type:  Individual lab work  
Weight : 5% 
Time : In Class/ …../  
Tasks :  
In this assignment, you will continue working on your selected data source from the previous 
assignment and present data set statistics  and practice data visualization.  
For each selected column compute : 
1. Range  
2. Interquartile  range  (IQR)  
3. Median  
4. Mean  
5. Mode  
6. 1st and 3rd  quartile s 
7. Sample variance and sample standard deviation  
8. Find Outliers   
Draw the following  diagrams:  
1. Box Plot  
2. Sea-born Simplot  
3. Linear regression  
4. HeatMap  
Perform statistical analysis and present  results  in graphical format . 
Deliverables:  
a) Submit a professional r eport  including at least:   
a. Explain  the steps/formulas that have been used for calculation process  b. Detailed explanations of your solution including snippets of your source code.  
c. Describe results from statistical  analysis and diagrams . 
d. Your 

In [87]:
import re
text = ""
pageObj = pdfReader.pages[0]
text += pageObj.extract_text()

print(text.split('\n')[0])

print(re.findall(r".*:", text)[0])

In-Class -Assigment 3:   
In-Class -Assigment 3:


In [88]:
page_one = pdfReader.pages[0]
text += pageObj.extract_text()
print(page_one)
print(text)

{'/Type': '/Page', '/Parent': IndirectObject(2, 0, 1965144482768), '/Resources': {'/Font': {'/F1': IndirectObject(5, 0, 1965144482768), '/F2': IndirectObject(12, 0, 1965144482768), '/F3': IndirectObject(14, 0, 1965144482768), '/F4': IndirectObject(19, 0, 1965144482768), '/F5': IndirectObject(21, 0, 1965144482768)}, '/ExtGState': {'/GS10': IndirectObject(10, 0, 1965144482768), '/GS11': IndirectObject(11, 0, 1965144482768)}, '/ProcSet': ['/PDF', '/Text', '/ImageB', '/ImageC', '/ImageI']}, '/MediaBox': [0, 0, 612, 792], '/Contents': IndirectObject(4, 0, 1965144482768), '/Group': {'/Type': '/Group', '/S': '/Transparency', '/CS': '/DeviceRGB'}, '/Tabs': '/S', '/StructParents': 0}
In-Class -Assigment 3:   
Statistic Analysis  
Assignment Type:  Individual lab work  
Weight : 5% 
Time : In Class/ …../  
Tasks :  
In this assignment, you will continue working on your selected data source from the previous 
assignment and present data set statistics  and practice data visualization.  
For each 

In [89]:
pdf_document_writer = PyPDF2.PdfWriter()

In [90]:
pdf_document_writer.add_page(page_one)

{'/Type': '/Page',
 '/Resources': {'/Font': {'/F1': {'/Type': '/Font',
    '/Subtype': '/Type0',
    '/BaseFont': '/TimesNewRomanPS-BoldMT',
    '/Encoding': '/Identity-H',
    '/DescendantFonts': [IndirectObject(7, 0, 1965130618864)],
    '/ToUnicode': {'/Filter': '/FlateDecode'}},
   '/F2': {'/Type': '/Font',
    '/Subtype': '/TrueType',
    '/Name': '/F2',
    '/BaseFont': '/TimesNewRomanPS-BoldMT',
    '/Encoding': '/WinAnsiEncoding',
    '/FontDescriptor': {'/Type': '/FontDescriptor',
     '/FontName': '/TimesNewRomanPS-BoldMT',
     '/Flags': 32,
     '/ItalicAngle': 0,
     '/Ascent': 891,
     '/Descent': -216,
     '/CapHeight': 677,
     '/AvgWidth': 427,
     '/MaxWidth': 2558,
     '/FontWeight': 700,
     '/XHeight': 250,
     '/Leading': 42,
     '/StemV': 42,
     '/FontBBox': [-558, -216, 2000, 677]},
    '/FirstChar': 32,
    '/LastChar': 32,
    '/Widths': [250]},
   '/F3': {'/Type': '/Font',
    '/Subtype': '/Type0',
    '/BaseFont': '/TimesNewRomanPSMT',
    '/Encod

In [91]:
pdf_output = open("new_file_2.pdf", 'wb')

In [92]:
pdf_document_writer.write(pdf_output)

(False, <_io.BufferedWriter name='new_file_2.pdf'>)

In [93]:
pdfFileObj.close()
pdf_document_writer.close()
pdf_output.close()