In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found,
# one path per line (nothing is printed for directories that hold no files).
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Competition CSVs: `df` is the frame the model is fitted on further down,
# `dft` is the frame predictions are produced for.
train_csv = '/kaggle/input/nlp-getting-started/train.csv'
test_csv = '/kaggle/input/nlp-getting-started/test.csv'

df = pd.read_csv(train_csv)
dft = pd.read_csv(test_csv)

In [None]:
import re
# Patterns used by clean(), compiled once.
# A URL runs until the next whitespace character, so path/query characters such as
# '-', '?', '=', '%', '&', '#' and '~' are removed together with the rest of the link.
_URL_RE = re.compile(r'https?://\S*')
_DIGITS_RE = re.compile(r'[0-9]+')
# Punctuation, tab and newline. '-' is escaped so it is a literal, not a range.
_SPECIAL_RE = re.compile(r'[!"#$%&()*+,\-./:;=?@\\^_`~\t\n<>\[\]{}]')
_MULTI_SPACE_RE = re.compile(r' {2,}')


def clean(text):
    """Normalize a piece of raw text before vectorization.

    Steps, in order:
      1. replace every http/https URL with a single space;
      2. delete runs of digits (no space is inserted, so ``"a1b"`` becomes ``"ab"``);
      3. replace the listed punctuation characters, tabs and newlines with a space
         (apostrophes and ``|`` are left untouched);
      4. collapse runs of spaces into one and strip leading/trailing whitespace.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or a pandas ``NaN``) is
        treated as missing.

    Returns
    -------
    str
        The cleaned text; ``""`` for missing / non-string input.
    """
    if not isinstance(text, str):
        # Missing values reach .apply() as float NaN / None; re.sub would raise on them.
        return ''
    res = _URL_RE.sub(' ', text)
    res = _DIGITS_RE.sub('', res)
    res = _SPECIAL_RE.sub(' ', res)
    res = _MULTI_SPACE_RE.sub(' ', res)
    return res.strip()

In [None]:
# Run every training text through clean() and store the result back in the
# same column, so the model is fitted on the cleaned text.
train_text_cleaned = df['text'].apply(clean)
df['text'] = train_text_cleaned

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# SGDClassifier shuffles the training data between epochs; without a fixed seed the
# CV scores, the selected hyper-parameters and the final predictions can differ
# from one "Run All" to the next. Fixing the seed makes the notebook reproducible.
RANDOM_STATE = 42

# Text -> token counts -> tf-idf weights -> linear classifier trained with SGD.
pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        # max_iter / tol given here are only starting values: the grid search
        # below overrides both for every candidate it evaluates.
        (
            "clf",
            SGDClassifier(max_iter=2000, tol=5e-4, random_state=RANDOM_STATE),
        ),
    ]
)

# Candidate values for the classifier step ("clf__" prefix routes them to it).
param_grid = {
    "clf__max_iter": [2000, 3000, 4000],
    "clf__tol": [1e-2, 1e-3, 1e-4],
}

# Exhaustive search over the 3 x 3 grid with 5-fold cross-validation.
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

In [None]:
# Fit the grid search on the training text against the `target` column.
grid_search.fit(df['text'], df['target'])

# Columns of the CV report shown below: score statistics plus the two tuned parameters.
summary_columns = [
    "mean_test_score",
    "std_test_score",
    "param_clf__max_iter",
    "param_clf__tol",
]

# One row per parameter combination, best mean test score first.
cv_results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values("mean_test_score", ascending=False)
)
cv_results[summary_columns].head(5)

In [None]:
# Report the winning parameter combination and its mean cross-validated score,
# one item per line.
report_lines = (
    "Best params:",
    grid_search.best_params_,
    f"Internal CV score: {grid_search.best_score_:.3f}",
)
print(*report_lines, sep="\n")

In [None]:
# Clean the test text with the same clean() function applied to the training text,
# keep the cleaned version in the frame, and predict with the refit best estimator.
test_text_cleaned = dft['text'].apply(clean)
dft['text'] = test_text_cleaned
predictions = grid_search.predict(test_text_cleaned)

In [None]:
# Assemble the two submission columns (despite its name, `submission_df` is a
# plain dict), turn them into a DataFrame and write it without the index column.
submission_df = {"id": dft['id']}
submission_df["target"] = predictions

submission = pd.DataFrame(data=submission_df)
submission.to_csv('submission.csv', index=False)