# Text classification using TF-IDF

### 1. Load the dataset from sklearn.datasets

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
import nltk
from nltk.stem.snowball import SnowballStemmer

In [3]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

### 2. Training data

In [4]:
from sklearn.datasets import fetch_20newsgroups

In [5]:
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

### 3. Test data

In [6]:
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

###  a.  You can access the values for the target variable using .target attribute 
###  b. You can access the name of the class in the target variable with .target_names


In [7]:
twenty_train.target

array([1, 1, 3, ..., 2, 2, 2], dtype=int64)

In [8]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [9]:
twenty_train.data[0:5]

['From: sd345@city.ac.uk (Michael Collier)\nSubject: Converting images to HP LaserJet III?\nNntp-Posting-Host: hampton\nOrganization: The City University\nLines: 14\n\nDoes anyone know of a good way (standard PC application/PD utility) to\nconvert tif/img/tga files into LaserJet III format.  We would also like to\ndo the same, converting to HPGL (HP plotter) files.\n\nPlease email any response.\n\nIs this the correct group?\n\nThanks in advance.  Michael.\n-- \nMichael Collier (Programmer)                 The Computer Unit,\nEmail: M.P.Collier@uk.ac.city                The City University,\nTel: 071 477-8000 x3769                      London,\nFax: 071 477-8565                            EC1V 0HB.\n',
 "From: ani@ms.uky.edu (Aniruddha B. Deglurkar)\nSubject: help: Splitting a trimming region along a mesh \nOrganization: University Of Kentucky, Dept. of Math Sciences\nLines: 28\n\n\n\n\tHi,\n\n\tI have a problem, I hope some of the 'gurus' can help me solve.\n\n\tBackground of the probl

### 4.  Now with dependent and independent data available for both train and test datasets, using TfidfVectorizer fit and transform the training data and test data and get the tfidf features for both

In [10]:
# TfidfVectorizer
vect = TfidfVectorizer()
x_train_dtm = vect.fit_transform(twenty_train.data)
x_test_dtm = vect.transform(twenty_test.data)

### 5. Use logisticRegression with tfidf features as input and targets as output and train the model and report the train and test accuracy score

In [11]:
from sklearn import metrics

In [12]:
# use logistic regression with text column only
logreg = LogisticRegression()
logreg.fit(x_train_dtm, twenty_train.target)
y_pred_class = logreg.predict(x_test_dtm)
print (metrics.accuracy_score(twenty_test.target, y_pred_class))

0.8868175765645806


## Sentiment analysis <br> 

The objective of this problem is to perform Sentiment analysis from the tweets data collected from the users targeted at various mobile devices.
Based on the tweet posted by a user (text), we will classify if the sentiment of the user targeted at a particular mobile device is positive or not.

### 1. Read the dataset (tweets.csv) and drop the NA's while reading the dataset

In [13]:
import pandas as pd

In [14]:
# read tweet.csv into a DataFrame
tweet = pd.read_csv('tweets.csv',encoding='ISO-8859-1').dropna()

In [15]:
tweet.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [16]:
tweet.shape

(3291, 3)

### 2. Preprocess the text and add the preprocessed text in a column with name `text` in the dataframe.

In [17]:
def preprocess(text):
    try:
        return text.decode('ascii')
    except Exception as e:
        return ""

In [18]:
tweet['text'] = [preprocess(text) for text in tweet.tweet_text]

In [19]:
tweet.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,


### 3. Consider only rows having Positive emotion and Negative emotion and remove other rows from the dataframe.

In [20]:
tweet.groupby('is_there_an_emotion_directed_at_a_brand_or_product').count()

Unnamed: 0_level_0,tweet_text,emotion_in_tweet_is_directed_at,text
is_there_an_emotion_directed_at_a_brand_or_product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
I can't tell,9,9,9
Negative emotion,519,519,519
No emotion toward brand or product,91,91,91
Positive emotion,2672,2672,2672


In [21]:
tweet_pos_neg = tweet[(tweet.is_there_an_emotion_directed_at_a_brand_or_product == 'Negative emotion') | (tweet.is_there_an_emotion_directed_at_a_brand_or_product == 'Positive emotion')]

In [22]:
tweet_pos_neg.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,


In [23]:
tweet_pos_neg.shape

(3191, 4)

In [24]:
tweet_pos_neg.groupby('is_there_an_emotion_directed_at_a_brand_or_product').count()

Unnamed: 0_level_0,tweet_text,emotion_in_tweet_is_directed_at,text
is_there_an_emotion_directed_at_a_brand_or_product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Negative emotion,519,519,519
Positive emotion,2672,2672,2672


In [25]:
tweet_pos_neg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3191 entries, 0 to 9088
Data columns (total 4 columns):
tweet_text                                            3191 non-null object
emotion_in_tweet_is_directed_at                       3191 non-null object
is_there_an_emotion_directed_at_a_brand_or_product    3191 non-null object
text                                                  3191 non-null object
dtypes: object(4)
memory usage: 124.6+ KB


In [26]:
y = tweet_pos_neg.is_there_an_emotion_directed_at_a_brand_or_product 
X = tweet_pos_neg.tweet_text

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### 4. Represent text as numerical data using `CountVectorizer` and get the document term frequency matrix

#### Use `vect` as the variable name for initialising CountVectorizer.

In [27]:
vect = CountVectorizer()
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [28]:
X_train_dtm = vect.transform(X_train)
X_test_dtm = vect.transform(X_test)

In [46]:
X_train_dtm.shape

(2393, 4919)

In [47]:
X_test_dtm.shape

(798, 4919)

### 5. Find number of different words in vocabulary

In [29]:
print(vect.vocabulary_)

{'tried': 4435, 'installing': 2239, 'mention': 2721, 'on': 2999, 'my': 2858, 'iphone': 2291, 'but': 669, 'it': 2311, 'crashes': 1027, 'every': 1480, 'time': 4347, 'open': 3008, 'sxsw': 4149, 'ipad2': 2285, 'rocks': 3606, 'apple': 315, 'pop': 3244, 'up': 4546, 'store': 4044, 'link': 2520, 'what': 4722, 'your': 4863, 'take': 4191, 'ipad': 2283, 'really': 3452, 'want': 4671, 'checkins': 791, 'aron': 344, 'pilhofer': 3182, 'from': 1736, 'the': 4279, 'new': 2900, 'york': 4860, 'times': 4351, 'just': 2363, 'endorsed': 1418, 'html': 2119, 'over': 3054, 'at': 367, 'newsapps': 2904, 'and': 268, 'asked': 360, 'us': 4562, 'not': 2937, 'to': 4366, 'tweet': 4475, 'he': 2007, 'actually': 162, 'said': 3640, 'lt': 2595, 'guess': 1929, 'who': 4729, 'won': 4787, 'an': 265, 'unsix': 4538, 'tweetup': 4484, 'thanks': 4276, 'amp': 262, 'happydance': 1983, 'pedicab': 3137, 'charger': 776, 'would': 4816, 'be': 482, 'epic': 1448, 'win': 4752, 'data': 1109, 'crunch': 1059, 'is': 2303, 'crippling': 1047, 'google

#### Tip: To see all available functions for an Object use dir

In [30]:
dir(vect)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_limit_features',
 '_sort_features',
 '_validate_vocabulary',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',
 'build_preprocessor',
 'build_tokenizer',
 'decode',
 'decode_error',
 'dtype',
 'encoding',
 'fit',
 'fit_transform',
 'fixed_vocabulary_',
 'get_feature_names',
 'get_params',
 'get_stop_words',
 'input',
 'inverse_transform',
 'lowercase',
 'max_df',
 'max_features',
 'min_df',
 'ngram_range',
 'preprocessor',
 'set_params',
 'stop_words',
 'stop_word

### 6. Find out how many Positive and Negative emotions are there.

Hint: Use value_counts on that column

In [31]:
tweet_pos_neg['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

Positive emotion    2672
Negative emotion     519
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

### 7. Change the labels for Positive and Negative emotions as 1 and 0 respectively and store in a different column in the same dataframe named 'Label'

Hint: use map on that column and give labels

In [32]:
tweet_pos_neg['Label'] = tweet_pos_neg.is_there_an_emotion_directed_at_a_brand_or_product.map({
    'Negative emotion': 0,
    'Positive emotion': 1,
})

In [33]:
tweet_pos_neg.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,text,Label
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,,1


### 8. Define the feature set (independent variable or X) to be `text` column and `labels` as target (or dependent variable)  and divide into train and test datasets

In [34]:
y = tweet_pos_neg.Label
X = tweet_pos_neg.tweet_text

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## 9. **Predicting the sentiment:**


### Use Naive Bayes and Logistic Regression and their accuracy scores for predicting the sentiment of the given text

In [35]:
def tokenize_test(vect):
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)
    print ('Features: ', X_train_dtm.shape[1])
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print ('Naive Bayes - Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)*100)

In [36]:
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)

Features:  24855
Naive Bayes - Accuracy:  85.58897243107769


In [37]:
def tokenize_test(vect):
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)
    print ('Features: ', X_train_dtm.shape[1])
    X_test_dtm = vect.transform(X_test)
    nb = LogisticRegression()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print ('Logistic - Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)*100)

In [38]:
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)

Features:  24855
Logistic - Accuracy:  86.59147869674186


## 10. Create a function called `tokenize_predict` which can take count vectorizer object as input and prints the accuracy for x (text) and y (labels)

In [39]:
def tokenize_test(vect):
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)
    print ('Features: ', X_train_dtm.shape[1])
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print ('Naive Bayes - Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)*100)

### Create a count vectorizer function which includes n_grams = 1,2  and pass it to tokenize_predict function to print the accuracy score

In [40]:
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)

Features:  24855
Naive Bayes - Accuracy:  85.58897243107769


### Create a count vectorizer function with stopwords = 'english'  and pass it to tokenize_predict function to print the accuracy score

In [41]:
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)

Features:  4681
Naive Bayes - Accuracy:  85.33834586466166


### Create a count vectorizer function with stopwords = 'english' and max_features =300  and pass it to tokenize_predict function to print the accuracy score

In [42]:
vect = CountVectorizer(stop_words='english',max_features=300)
tokenize_test(vect)

Features:  300
Naive Bayes - Accuracy:  81.07769423558896


### Create a count vectorizer function with n_grams = 1,2  and max_features = 15000  and pass it to tokenize_predict function to print the accuracy score

In [43]:
vect = CountVectorizer(ngram_range=(1, 2),max_features=15000)
tokenize_test(vect)

Features:  15000
Naive Bayes - Accuracy:  85.33834586466166


### Create a count vectorizer function with n_grams = 1,2  and include terms that appear at least 2 times (min_df = 2)  and pass it to tokenize_predict function to print the accuracy score

In [44]:
vect = CountVectorizer(ngram_range=(1, 2), min_df = 2)
tokenize_test(vect)

Features:  7764
Naive Bayes - Accuracy:  85.83959899749374
