Using the BBC collection in question, combine the 1 and -1 classes into one and perform classification using the SVM (linear kernel) method, comparing the subjective class (1 and -1) with the objective class (0). Conduct the research for reasonably similar numbers of posts from each class (e.g., 500 and 500). 

In [5]:
# importing libraries
import pandas as pd
import numpy as np
import tmtoolkit as tm
from sklearn.model_selection import train_test_split
from sklearn import svm

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import metrics
import matplotlib.pyplot as plt

import kagglehub
from kagglehub import KaggleDatasetAdapter

  from .autonotebook import tqdm as notebook_tqdm


In [50]:
file_path = "synthetic_social_media_data.csv"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "abdullah0a/social-media-sentiment-analysis-dataset",
  file_path
)

df = df.rename(columns={'Sentiment Label': 'Sentiment', 'Post Content': 'Content'})
df = df[['Sentiment', 'Content']]
df = df.sort_values(by = 'Sentiment').reset_index(drop=True)
df['Sentiment'] = df['Sentiment'].map({'Positive': 1, 'Negative': -1, 'Neutral': 0})
df

  df = kagglehub.load_dataset(


Unnamed: 0,Sentiment,Content
0,-1,Place now sing question drug wide size. What a...
1,-1,Member join when really ok country. Wife indic...
2,-1,Place policy myself forward. Involve defense e...
3,-1,Over church nor off. Manage out without far me...
4,-1,Art hundred contain tonight. Meeting partner a...
...,...,...
1995,1,Ball could hospital oil moment democratic some...
1996,1,Sit five stop actually. Hold hold worker here ...
1997,1,Future tend at less part appear across. Still ...
1998,1,Rest keep citizen civil loss remain. Find deca...


In [53]:
# get comments with strong opinion
df_opinion = pd.concat([df.head(300), df.tail(300)])
df_opinion['Sentiment']=1

df_neutral = df.loc[df['Sentiment']==0].sample(n=600)

df_full = pd.concat([df_opinion,df_neutral])
df_full

Unnamed: 0,Sentiment,Content
0,1,Place now sing question drug wide size. What a...
1,1,Member join when really ok country. Wife indic...
2,1,Place policy myself forward. Involve defense e...
3,1,Over church nor off. Manage out without far me...
4,1,Art hundred contain tonight. Meeting partner a...
...,...,...
983,0,Fly appear president wait million. Account fir...
1072,0,Ask interest happen kid. Hand station must fea...
1333,0,Inside very run happy lose minute maintain. Qu...
1123,0,List support behavior make contain sport lay c...


In [63]:
corp = tm.corpus.Corpus(dict(zip(range(1200), df_full.Content)), language = "en", load_features=[])
tm.corpus.set_document_attr(corp, attrname = "Sentiment", data = dict(zip(corp.doc_labels, df_full.Sentiment)))

tm.corpus.print_summary(corp)


Corpus with 1200 documents in English
> 0 (43 tokens): Place now sing question drug wide size . What alon...
> 1 (53 tokens): Member join when really ok country . Wife indicate...
> 2 (34 tokens): Place policy myself forward . Involve defense expe...
> 3 (39 tokens): Over church nor off . Manage out without far medic...
> 4 (47 tokens): Art hundred contain tonight . Meeting partner alon...
> 5 (31 tokens): Specific recognize defense almost decide security ...
> 6 (41 tokens): Film stop person work lose oil . Budget seem enoug...
> 7 (37 tokens): Public throughout wear compare your return . Share...
> 8 (41 tokens): Current write once from little nature . Meet yours...
> 9 (42 tokens): Report send lawyer more . Their pay need sometimes...
(and 1190 more documents)
total number of tokens: 50029 / vocabulary size: 1941


In [65]:
tm.corpus.filter_clean_tokens(corp, remove_numbers=True)
tm.corpus.to_lowercase(corp)

tm.corpus.print_summary(corp)

Corpus with 1200 documents in English
> 0 (28 tokens): place sing question drug wide size writer reduce w...
> 1 (36 tokens): member join ok country wife indicate open crime pr...
> 2 (24 tokens): place policy forward involve defense expect dream ...
> 3 (19 tokens): church manage far medical outside present picture ...
> 4 (29 tokens): art contain tonight meeting partner fish affect ba...
> 5 (19 tokens): specific recognize defense decide security near st...
> 6 (28 tokens): film stop person work lose oil budget firm confere...
> 7 (24 tokens): public wear compare return share fund source parti...
> 8 (26 tokens): current write little nature meet adult size floor ...
> 9 (25 tokens): report send lawyer pay need behavior pull action u...
(and 1190 more documents)
total number of tokens: 31833 / vocabulary size: 762


In [75]:
mat, doc_labels, vocab = tm.corpus.dtm(corp, return_doc_labels=True, return_vocab=True)
z = tm.bow.dtm.dtm_to_dataframe(mat, doc_labels, vocab).T

z

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1190,1191,1192,1193,1194,1195,1196,1197,1198,1199
a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ability,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
able,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accept,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
according,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yard,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yeah,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
year,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yes,0,0,0,0,0,1,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0


In [81]:
ind_words = z.sum(axis = 1) >= 5

print(sum(ind_words))

761


In [77]:
print(mat)

<Compressed Sparse Row sparse matrix of dtype 'int32'
	with 31303 stored elements and shape (1200, 762)>
  Coords	Values
  (0, 35)	1
  (0, 80)	1
  (0, 97)	1
  (0, 101)	1
  (0, 152)	1
  (0, 177)	1
  (0, 194)	1
  (0, 309)	1
  (0, 331)	1
  (0, 346)	1
  (0, 373)	1
  (0, 381)	1
  (0, 415)	1
  (0, 445)	1
  (0, 453)	1
  (0, 486)	1
  (0, 499)	1
  (0, 530)	1
  (0, 544)	1
  (0, 550)	1
  (0, 611)	1
  (0, 613)	1
  (0, 617)	1
  (0, 694)	1
  (0, 717)	1
  :	:
  (1198, 739)	1
  (1199, 14)	1
  (1199, 39)	1
  (1199, 46)	1
  (1199, 102)	1
  (1199, 121)	1
  (1199, 128)	1
  (1199, 199)	1
  (1199, 206)	1
  (1199, 226)	1
  (1199, 240)	1
  (1199, 251)	1
  (1199, 274)	1
  (1199, 279)	1
  (1199, 330)	1
  (1199, 342)	1
  (1199, 355)	1
  (1199, 364)	1
  (1199, 436)	1
  (1199, 497)	1
  (1199, 519)	1
  (1199, 552)	1
  (1199, 620)	1
  (1199, 667)	1
  (1199, 678)	1


In [79]:
mat1 = mat[:,ind_words]


# z1 = tm.bow.dtm.dtm_to_dataframe(mat1, doc_labels, np.array(vocab)[ind_words]).T

# ind_docs = z1.sum(axis = 0) >= 5

# mat2 = mat1[ind_docs,:]

# z2 = tm.bow.dtm.dtm_to_dataframe(mat2, np.array(doc_labels)[ind_docs], np.array(vocab)[ind_words])

# z2.T

AttributeError: 'Series' object has no attribute 'nonzero'