**LOAD DATASET**

In [13]:
import pandas as pd

train = pd.read_csv(r'C:\Users\KIIT\Downloads\Train.csv')
test = pd.read_csv(r'C:\Users\KIIT\Downloads\Test.csv')

In [17]:
train_indices = train.shape[0]
test_indices = test.shape[0]
print("train_indices, test_indices = ", train_indices, test_indices )
total = pd.concat([train, test], axis=0, ignore_index = True)
print("total.shape = ",total.shape)

train_indices, test_indices =  40000 5000
total.shape =  (45000, 2)


**PREPROCESSING** \
1.Removing special characters\
2.Tokenize and remove stopwords\
3.Stemming\
4.Lemmatizing

In [19]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

def preprocess(total):


    def preprocessing(text):    
        text = re.sub('<[^>]*>','',text)
        text = re.sub('[\W+^]',' ',text) 
        return text
    
    stop = set(stopwords.words('english'))
    def my_swremove(text):
        text=word_tokenize(text)
        new_text =[ words.lower() for words in text if words not in stop]
        return new_text

    port =  PorterStemmer()
    def my_stemmer(text):

        stemmed = [ port.stem(words) for words in text]
        return stemmed

    lem = WordNetLemmatizer()
    def my_lemmatizer(text):
        lem_text = [ lem.lemmatize(words) for words in text ]
        return ' '.join(lem_text)
    
    total['text'] = total['text'].apply(preprocessing)
    total['text'] = total['text'].apply(my_swremove)
    total['text'] = total['text'].apply(my_stemmer)
    total['text'] = total['text'].apply(my_lemmatizer)
    
    return total

**TF_IDF VECTOR**

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

def vectorize(text):
    tfid = TfidfVectorizer( smooth_idf = True, use_idf = True, preprocessor = None)
    X = tfid.fit_transform(text)
    saved_tfidf = open('saved_tfidf.sav', 'wb')
    pickle.dump(tfid , saved_tfidf)
    saved_tfidf.close()
    return X

**APPLYING ALL FUNCTIONS TO DATASET**

In [22]:
total = preprocess(total)
total['text'][0]

'i grew b 1965 watch love thunderbird all mate school watch we play thunderbird school lunch school we want virgil scott no one want alan count 5 becam art form i took child see movi hope would get glimps i love child how bitterli disappoint the high point snappi theme tune not could compar origin score thunderbird thank earli saturday morn one televis channel still play rerun seri gerri anderson wife creat jonatha frake hand director chair version complet hopeless a wast film utter rubbish a cgi remak may accept replac marionett homo sapien subsp sapien huge error judgment'

In [53]:
X = vectorize(total['text'])
y = total['label']


**TRAIN TEST SPLIT**

In [54]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size= 1/8)

In [55]:
import pickle
from sklearn.linear_model import LogisticRegressionCV

lr = LogisticRegressionCV(
    cv = 4,
    scoring = 'accuracy',
    verbose = 3,
    max_iter = 300,
    n_jobs = -1
)

lr.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  3.7min finished


LogisticRegressionCV(Cs=10, class_weight=None, cv=4, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=300, multi_class='warn', n_jobs=-1, penalty='l2',
                     random_state=None, refit=True, scoring='accuracy',
                     solver='lbfgs', tol=0.0001, verbose=3)

In [56]:
saved_model = open( 'saved_model.sav', 'wb')
pickle.dump( lr, saved_model )
saved_model.close()

**EVALUATION**

In [71]:
print("score = ", str(round(lr.score( X_test, y_test )*100,2)) + "%")

score =  89.53%


__________________________________________________________________________________________________________________________

In [72]:
import pickle

model = pickle.load(open( 'saved_model.sav' , 'rb' ))
tfidf = pickle.load(open( 'saved_tfidf.sav' , 'rb' ))

In [90]:
s = [" It was really good actually", "Truth to be told, it was bad." ]
test_df = pd.DataFrame(s)
test_df.columns = ['text']
test_df = preprocess(test_df)
vec_test = tfidf.transform(test_df['text'])
list(model.predict ( vec_test ))

[1, 0]